-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 19a100f
Showing
25 changed files
with
1,226 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
.venv/ | ||
data/ | ||
.env |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
database: | ||
sql: | ||
uri: !ENV 'sqlite:///feature_store.db' | ||
chunksize: 10 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# CryptoML | ||
|
||
Current functionality: | ||
- Build datasets from bootstrap data | ||
|
||
Backlog: | ||
- Update datasets with new data records |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
typer | ||
xgboost | ||
pandas | ||
statsmodels | ||
scikit-learn | ||
scipy | ||
numpy | ||
requests | ||
pyyaml | ||
git+https://github.com/RedLicorice/pyti.git | ||
sqlalchemy | ||
confuse | ||
python-dotenv |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,109 @@ | ||
import typer | ||
import xgboost | ||
# Typer application object; the CLI commands in this file register on it
# through the @app.command decorator.
app = typer.Typer()
|
||
|
||
@app.command(name='bootstrap', help='Bootstrap dataset with data from zip files in data/bootstrap')
def build_dataset(symbol: str, currency: str):
    """
    Build the dataset files for one symbol/currency pair from bootstrap data.

    Writes four files next to each other, all prefixed with
    ``../data/dataset-{symbol}{currency}``: ``.csv`` (features),
    ``.meta.csv`` (per-feature metadata), ``.target.csv`` (targets) and
    ``.info.yaml`` (summary used by the ``selection`` command).

    :param str symbol: asset symbol passed to the kraken and coinmetrics loaders
    :param str currency: quote currency passed to the kraken loader
    """
    # Imports are kept function-local, as everywhere else in this module.
    import pandas as pd
    import yaml
    from crawlers import kraken, coinmetrics
    from dataset import build, get_feature_metadata, make_target

    target_name = '../data/dataset-{symbol}{currency}'.format(symbol=symbol, currency=currency)
    interval = '1D'

    # DataFrame.ffill() replaces the deprecated fillna(method='ffill').
    _kraken = kraken.get_bootstrap_data(symbol, currency).ffill()
    _coinmetrics = coinmetrics.get_bootstrap_data(symbol)
    ohlcv = kraken.ticks_to_ohlcv(_kraken, interval).ffill()

    result = build(ohlcv=ohlcv, coinmetrics=_coinmetrics, W=10)
    result.to_csv(target_name + '.csv', index_label='timestamp')

    _begin, _end, _features = get_feature_metadata(result)
    meta = pd.DataFrame.from_records(_features)
    meta.index = meta['name']
    meta.drop(labels='name', axis='columns', inplace=True)
    meta.to_csv(target_name + '.meta.csv', index_label='feature')

    target = make_target(ohlcv)
    target.to_csv(target_name + '.target.csv', index_label='timestamp')

    info = {
        'symbol': symbol,
        'currency': currency,
        'interval': interval,
        'records': result.shape[0],
        # The literal used to list 'features' twice (a column count here and
        # the mapping further down); only the mapping ever reached the file.
        # It is kept at this position so the YAML key order is unchanged.
        'features': {str(k): True for k in meta.index},
        'index_min': result.index.min().to_pydatetime().isoformat(),
        'index_max': result.index.max().to_pydatetime().isoformat(),
        'valid_index_min': _begin,
        'valid_index_max': _end,
        # Only the 'class' column is flagged True.
        'targets': {str(k): k == 'class' for k in target.columns},
    }
    with open(target_name + '.info.yaml', 'w') as f:
        yaml.dump(info, f, sort_keys=False)
    print('done')
|
||
@app.command(name='selection', help='Perform feature selection and update <dataset>.info.yaml with selected features')
def selection(symbol: str, currency: str, percent: float):
    """
    Select features with an XGBoost-based pipeline and record the outcome.

    Fits ``SelectFromModel`` on the first ``percent`` share of the valid
    records and rewrites the ``features`` mapping of
    ``<dataset>.info.yaml``. The previous file is copied to
    ``<dataset>.info.yaml.bak`` before it is overwritten.

    :param str symbol: asset symbol of the dataset to load
    :param str currency: quote currency of the dataset to load
    :param float percent: fraction of the valid index range used for training
    """
    import math
    import shutil

    import pandas as pd
    import yaml
    from crawlers import load_yaml
    from sklearn.feature_selection import SelectFromModel
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import StandardScaler
    from util.selection_pipeline import Pipeline
    from xgboost import XGBClassifier

    target_name = '../data/dataset-{symbol}{currency}'.format(symbol=symbol, currency=currency)
    info_path = target_name + '.info.yaml'

    info = load_yaml(info_path)
    dataset = pd.read_csv(target_name + '.csv', parse_dates=True, index_col='timestamp')
    target = pd.read_csv(target_name + '.target.csv', parse_dates=True, index_col='timestamp')
    first_valid_i = dataset.index.get_loc(info.valid_index_min)
    last_valid_i = dataset.index.get_loc(info.valid_index_max)

    training_records = math.floor((last_valid_i - first_valid_i) * percent)
    dataset['label'] = target['class']
    # Only the training slice is used here; the remaining valid records
    # (first_valid_i + training_records .. last_valid_i) are not evaluated.
    training_dataset = dataset.iloc[first_valid_i:first_valid_i + training_records]

    pipeline = Pipeline(steps=[
        ('s', StandardScaler()),
        ('i', SimpleImputer()),
        ('c', XGBClassifier(use_label_encoder=False)),
    ])

    X_train = training_dataset.drop(labels=['label'], axis='columns')
    # Treat +/-inf as missing, then forward-fill across columns. This
    # replaces the deprecated 'mode.use_inf_as_na' option and
    # fillna(method='ffill'); infinities with nothing to fill from become
    # NaN and are left for the imputer step.
    X_train = X_train.replace([math.inf, -math.inf], math.nan).ffill(axis='columns')
    y_train = training_dataset['label']

    sel = SelectFromModel(pipeline)
    sel.fit(X_train, y_train)
    support = sel.get_support()

    dinfo = info.to_dict()
    for c, mask in zip(X_train.columns, support):
        # bool() turns the numpy boolean into a plain bool for YAML.
        dinfo['features'][c] = bool(mask)

    # Back up the untouched file first. Dumping `info` after the update does
    # not reliably preserve the previous content.
    shutil.copyfile(info_path, info_path + '.bak')
    with open(info_path, 'w') as f:
        yaml.dump(dinfo, f, sort_keys=False)
    print('done')
|
||
|
||
|
||
@app.command()
def test(symbol: str, currency: str):
    """
    Smoke test: build the OHLCV technical-analysis frame and print its head.

    :param str symbol: asset symbol passed to the kraken loader
    :param str currency: quote currency passed to the kraken loader
    """
    from dataset import make_ohlcv_ta
    from crawlers import kraken

    # DataFrame.ffill() replaces the deprecated fillna(method='ffill').
    _kraken = kraken.get_bootstrap_data(symbol, currency).ffill()
    ohlcv = kraken.ticks_to_ohlcv(_kraken, '1D').ffill()
    ohlcv_ta = make_ohlcv_ta(ohlcv)
    print('It works')
    print(ohlcv_ta.head())
|
||
# Run the Typer CLI only when this file is executed as a script.
if __name__ == '__main__':
    app()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
import confuse | ||
from dotenv import load_dotenv | ||
import re, os, yaml | ||
|
||
# Load Configuration | ||
# Finds every ``${NAME}`` placeholder in a string; the single group captures
# NAME. Written as a raw string so that ``\$`` and ``\w`` reach the regex
# engine without relying on invalid string escapes.
env_variable_pattern = re.compile(r'.*?\${(\w+)}.*?')
# Placeholder; not assigned again anywhere in the visible part of this module.
_configfile = None
# Load variables from a .env file (python-dotenv) at import time.
# NOTE(review): presumably this is what makes them visible to the !ENV
# constructor through os.environ - confirm.
load_dotenv()
|
||
def replace_env_variables(loader, node):
    """
    Expand ``${NAME}`` placeholders in a YAML scalar from the environment.

    A placeholder whose variable is not set is replaced by the bare variable
    name. If the expanded string consists only of decimal digits it is
    returned as an ``int``.

    :param yaml.Loader loader: the yaml loader
    :param node: the current node in the yaml
    :return: the scalar unchanged when it holds no placeholder, otherwise
        the expanded string (or its integer value)
    """
    value = loader.construct_scalar(node)
    names = env_variable_pattern.findall(value)  # every variable in the scalar
    if not names:
        return value
    full_value = value
    for name in names:
        full_value = full_value.replace(f'${{{name}}}', os.environ.get(name, name))
    # isdecimal() rather than isnumeric(): the latter also accepts characters
    # such as superscripts and fractions, which int() rejects.
    return int(full_value) if full_value.isdecimal() else full_value
|
||
# Register the !ENV tag on confuse's YAML loader so tagged scalars are passed
# through replace_env_variables.
confuse.Loader.add_constructor('!ENV', replace_env_variables)
#config = confuse.Configuration('CryptoML-API', __name__)
# Module-level configuration object shared by importers of this module.
config = confuse.LazyConfig('CryptoML', __name__)
# NOTE(review): relative path - presumably resolved against the working
# directory the program is started from; confirm.
config.set_file('../config.yaml')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
import yaml | ||
from util.bunch import Bunch | ||
|
||
|
||
class Spec(dict):
    """
    API description: a base URL plus a mapping of named endpoint paths.

    ``base_url`` and ``endpoints`` are stored as items of the dict itself.
    Every other attribute is treated as an endpoint name: reading
    ``spec.name`` returns the full URL, or, when the endpoint path contains
    ``{placeholders}``, the bound ``str.format`` of the full URL so it can
    be called with keyword arguments. Assigning ``spec.name = path`` adds or
    replaces an endpoint.
    """

    # Attribute names that map to items of the dict instead of endpoints.
    _RESERVED = ('endpoints', 'base_url')

    def __init__(self, base_url, endpoints, **kwargs):
        """
        :param base_url: prefix prepended to every endpoint path
        :param dict endpoints: endpoint name -> path (may hold placeholders)
        :param kwargs: extra items stored in the dict as-is
        :raises ValueError: if ``endpoints`` is not a dict
        """
        # isinstance also admits dict subclasses such as OrderedDict.
        if not isinstance(endpoints, dict):
            raise ValueError('Endpoints must be a dictionary!')
        self.base_url = base_url
        self.endpoints = endpoints
        super().__init__(kwargs)

    @staticmethod
    def from_dict(spec):
        """
        Build a Spec from a mapping holding ``base_url`` and ``endpoints``.

        :raises ValueError: if the mapping is empty or lacks either key
        """
        if not spec:
            raise ValueError('Provided spec is invalid.')
        if 'base_url' not in spec:
            raise ValueError('Provided spec does not describe a base url.')
        if 'endpoints' not in spec:
            raise ValueError('Provided spec does not describe any endpoints.')
        return Spec(base_url=spec['base_url'], endpoints=spec['endpoints'])

    def __dir__(self):
        # dir(spec) lists the endpoint names only.
        return list(self.endpoints)

    def __setstate__(self, state):
        # No-op: state handed over on unpickling/copying is ignored.
        pass

    def __setattr__(self, key, value):
        if key in self._RESERVED:
            self[key] = value
        else:
            self.endpoints[key] = value

    def __getattr__(self, key):
        if key in self._RESERVED:
            try:
                return self[key]
            except KeyError:
                # Attribute protocol: a missing attribute must surface as
                # AttributeError so hasattr()/getattr(default) keep working.
                raise AttributeError(key) from None
        try:
            query = self.endpoints[key]
        except KeyError:
            raise AttributeError('Method not described in api spec: ' + key) from None
        url = self.base_url + query
        if '{' in query and '}' in query:
            # Parameterised endpoint: hand back the formatter for the caller.
            return url.format
        return url
|
||
|
||
def load_api_spec(filename):
    """
    Load an API description from a YAML file.

    :param filename: path of the YAML file
    :return: a Spec, or None when the file is empty or is not valid YAML
    :raises ValueError: if the document lacks the keys Spec.from_dict needs
    """
    with open(filename, 'r') as f:
        try:
            spec = yaml.safe_load(f)
        except yaml.YAMLError as exc:
            # Passing the exception as a separate print argument avoids the
            # str + exception TypeError the handler used to raise.
            print('error loading api spec', exc)
            return None
    if spec:
        return Spec.from_dict(spec)
    return None
|
||
def load_yaml(filename):
    """
    Load a YAML file into a Bunch.

    :param filename: path of the YAML file
    :return: Bunch built from the top-level mapping, or None when the file
        is empty or is not valid YAML
    """
    with open(filename, 'r') as f:
        try:
            spec = yaml.safe_load(f)
        except yaml.YAMLError as exc:
            # Passing the exception as a separate print argument avoids the
            # str + exception TypeError the handler used to raise.
            print('error loading yaml', exc)
            return None
    if spec:
        return Bunch(**spec)
    return None
|
||
def save_yaml(filename, data):
    """
    Write ``data`` to ``filename`` as YAML using the safe dumper.

    A YAML serialisation error is reported on stdout and not re-raised.

    :param filename: path of the file to (over)write
    :param data: object to serialise
    """
    with open(filename, 'w') as f:
        try:
            yaml.safe_dump(data=data, stream=f)
        except yaml.YAMLError as exc:
            # Exceptions have no .message attribute in Python 3; print the
            # exception itself instead.
            print('error saving yaml', exc)
|
||
def load_bootstrap(zip_file, csv_file):
    """
    Read one CSV member of a zip archive into a DataFrame.

    :param zip_file: path of the zip archive
    :param csv_file: name of the CSV member inside the archive
    :return: DataFrame indexed by the parsed ``date`` column
    """
    import pandas as pd
    import zipfile

    with zipfile.ZipFile(zip_file) as archive, archive.open(csv_file) as member:
        frame = pd.read_csv(member, delimiter=",", parse_dates=True, index_col='date')
    return frame
|
||
def bootstrap_index(filename):
    """
    Return the ``bootstrap`` entry of the YAML index file.

    :param filename: path of the index YAML file
    """
    return load_yaml(filename).bootstrap
|
||
def load_transformer(filename):
    """
    Import a Python source file as a module and return it.

    The module is named ``bootstrap.<file stem>`` and is executed
    immediately; it is not registered in ``sys.modules``.

    :param filename: path of the Python source file
    :return: the executed module object
    :raises ImportError: if no import spec/loader can be built for the file
    """
    # Import the submodule explicitly: a bare `import importlib` does not
    # guarantee that `importlib.util` is available as an attribute.
    import importlib.util
    import ntpath

    # ntpath splits on both '/' and '\\'; splitext drops the extension
    # without assuming it is exactly three characters long.
    stem = ntpath.splitext(ntpath.basename(filename))[0]
    spec = importlib.util.spec_from_file_location("bootstrap." + stem, filename)
    if spec is None or spec.loader is None:
        raise ImportError('Cannot load transformer module from ' + str(filename))
    transformer = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(transformer)
    return transformer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
base_url: 'https://community-api.coinmetrics.io/v4' | ||
api_key: '' | ||
endpoints: | ||
asset_metadata: '/catalog/assets?assets={assets}&pretty=false' | ||
metrics_timeseries: '/timeseries/asset-metrics?assets={assets}&metrics={metrics}&frequency={frequency}&status=all&start_time={begin}&end_time={end}&end_inclusive=true&timezone=Europe/Rome' | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
def api():
    """Load and return the API spec described in coinmetrics-community.yaml."""
    from . import load_api_spec
    spec_path = 'crawlers/coinmetrics-community.yaml'
    return load_api_spec(spec_path)
|
||
def get_assets(assets):
    """
    Fetch catalog metadata for one or more assets.

    :param assets: asset name, comma-separated names, or a list/tuple of names
    :return: the ``data`` entry of the JSON response
    :raises requests.HTTPError: if the server answers with an error status
    """
    import requests

    if isinstance(assets, (list, tuple)):
        assets = ','.join(assets)
    # Without a timeout a stalled connection would block forever.
    resp = requests.get(api().asset_metadata(assets=assets), timeout=30)
    resp.raise_for_status()
    return resp.json()['data']
|
||
def get_asset_features(asset_name, frequency):
    """
    List the metrics available for an asset at the given frequency.

    :param asset_name: asset name(s), forwarded to get_assets
    :param frequency: frequency string a metric must offer to be included
    :return: list of dicts with keys ``index`` (running number starting at
        1), ``dataset``, ``asset``, ``name``, ``min``, ``max``, ``enabled``
    """
    result = []
    # Named `index` instead of `id` so the builtin is not shadowed.
    index = 1
    for asset in get_assets(asset_name):
        for metric in asset['metrics']:
            for f in metric['frequencies']:
                if frequency == f['frequency']:
                    result.append({
                        'index': index,
                        'dataset': 'coinmetrics',
                        'asset': asset['asset'],
                        'name': metric['metric'],
                        'min': f['min_time'],
                        'max': f['max_time'],
                        'enabled': True,
                    })
                    # NOTE(review): nesting reconstructed - the counter is
                    # assumed to advance once per appended record; confirm.
                    index += 1
    return result
|
||
def get_asset_metrics(asset_name, metrics, frequency, begin, end):
    """
    Download a metrics time series, following ``next_page_url`` links.

    :param asset_name: asset(s) to query
    :param metrics: metric name(s); a list or tuple is joined with commas
    :param frequency: sampling frequency passed to the endpoint
    :param begin: start time passed to the endpoint
    :param end: end time passed to the endpoint
    :return: the ``data`` entries of all pages, concatenated
    :raises requests.HTTPError: if the server answers with an error status
    """
    import time

    import requests

    if isinstance(metrics, (list, tuple)):
        metrics = ','.join(metrics)
    url = api().metrics_timeseries(assets=asset_name, metrics=metrics, frequency=frequency, begin=begin, end=end)
    # Without a timeout a stalled connection would block forever.
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    rj = resp.json()
    result = rj['data']
    while 'next_page_url' in rj:
        time.sleep(0.5)  # pause between page requests
        resp = requests.get(rj['next_page_url'], timeout=30)
        resp.raise_for_status()
        rj = resp.json()
        result += rj['data']
    return result
|
||
|
||
def get_bootstrap_data(symbol):
    """
    Load the coinmetrics bootstrap data for a symbol.

    The symbol is lower-cased. If it has an entry in the index's ``groups``,
    the symbol's own CSV and every CSV of the group are loaded and
    concatenated; otherwise only the symbol's own CSV is loaded.

    :param symbol: asset symbol
    :return: whatever the transformer's ``get_df`` returns, or the pandas
        concatenation of several such results
    """
    from . import bootstrap_index, load_transformer

    symbol = symbol.lower()
    base_dir = '../data/bootstrap/'
    try:
        index = bootstrap_index(base_dir + 'index.yaml')
        transformer = load_transformer(base_dir + index.coinmetrics.transformer)
        grouped = symbol in index.coinmetrics.groups
        own_csv = index.coinmetrics.name_format.format(symbol=symbol) + '.csv'
        archive = base_dir + index.coinmetrics.zipfile
        if not grouped:
            return transformer.get_df(archive, own_csv)

        members = [own_csv]
        members += [name + '.csv' for name in index.coinmetrics.groups[symbol]]
        frames = [transformer.get_df(archive, member) for member in members]

        import pandas as pd
        return pd.concat(frames)
    except Exception as e:
        # Report, then let the caller see the original exception.
        print('Exception occurred! ' + str(e))
        raise
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
def api():
    """Load and return the API spec described in kraken.yaml."""
    from . import load_api_spec
    spec_path = 'crawlers/kraken.yaml'
    return load_api_spec(spec_path)
|
||
def get_pair_ohlc(pair, metrics, frequency, begin, end):
    """
    Download OHLC data for a pair, following ``next_page_url`` links.

    :param pair: pair identifier, passed to the endpoint as ``assets``
    :param metrics: metric name(s); a list or tuple is joined with commas
    :param frequency: sampling frequency passed to the endpoint
    :param begin: currently unused - the request always uses ``since=0``
    :param end: currently unused
    :return: the ``data`` entries of all pages, concatenated
    :raises requests.HTTPError: if the server answers with an error status
    """
    import time

    import requests

    if isinstance(metrics, (list, tuple)):
        metrics = ','.join(metrics)
    url = api().ohlc_data(assets=pair, metrics=metrics, frequency=frequency, since=0)
    # Without a timeout a stalled connection would block forever.
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    rj = resp.json()
    result = rj['data']
    while 'next_page_url' in rj:
        time.sleep(0.5)  # pause between page requests
        resp = requests.get(rj['next_page_url'], timeout=30)
        resp.raise_for_status()
        rj = resp.json()
        result += rj['data']
    return result
|
||
|
||
def get_bootstrap_data(symbol, currency):
    """
    Load the kraken bootstrap data for a symbol/currency pair.

    ``btc`` and ``doge`` are translated to ``xbt`` and ``xdg`` (in either
    position, in any letter case) before the upper-cased pair is inserted
    into the index's ``name_format`` to find the CSV.

    :param symbol: asset symbol
    :param currency: quote currency
    :return: whatever the transformer's ``get_df`` returns for the pair's CSV
    :raises ValueError: if the index defines groups for kraken
    """
    from . import bootstrap_index, load_transformer

    _convert_map = {
        'btc': 'xbt',
        'doge': 'xdg',
    }
    # Look up case-insensitively so 'BTC' is converted like 'btc'; the result
    # is upper-cased below either way.
    symbol = _convert_map.get(symbol.lower(), symbol)
    currency = _convert_map.get(currency.lower(), currency)
    try:
        index = bootstrap_index('../data/bootstrap/index.yaml')
        transformer = load_transformer('../data/bootstrap/' + index.kraken.transformer)
        if index.kraken.groups:
            raise ValueError('Groups are not supported for kraken Loader')
        filename = index.kraken.name_format.format(symbol=symbol.upper(), currency=currency.upper()) + '.csv'
        return transformer.get_df('../data/bootstrap/' + index.kraken.zipfile, filename)
    except Exception as e:
        # Report, then let the caller see the original exception.
        print('Exception occurred! ' + str(e))
        raise
|
||
|
||
def ticks_to_ohlcv(ticks, interval):
    """
    Aggregate tick data into OHLCV candles.

    :param ticks: frame with ``price`` and ``amount`` columns that supports
        ``resample``
    :param interval: resampling rule, e.g. ``'1D'``
    :return: frame with the ``ohlc()`` columns of ``price`` plus ``volume``,
        the per-interval sum of ``amount``
    """
    bins = ticks.resample(interval)
    volume = bins['amount'].sum()
    return bins['price'].ohlc().assign(volume=volume)
Oops, something went wrong.