diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..beb509c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+.venv/
+data/
+.env
diff --git a/config.yaml b/config.yaml
new file mode 100644
index 0000000..01f37e2
--- /dev/null
+++ b/config.yaml
@@ -0,0 +1,4 @@
+database:
+  sql:
+    uri: !ENV 'sqlite:///feature_store.db'
+    chunksize: 10
\ No newline at end of file
diff --git a/readme.md b/readme.md
new file mode 100644
index 0000000..897aa3e
--- /dev/null
+++ b/readme.md
@@ -0,0 +1,7 @@
+# CryptoML
+
+Current functionality:
+ - Build datasets from bootstrap data
+
+Backlog:
+ - Update datasets with new data records
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..9e5cafb
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,13 @@
+typer
+xgboost
+pandas
+statsmodels
+scikit-learn
+scipy
+numpy
+requests
+pyyaml
+git+https://github.com/RedLicorice/pyti.git
+sqlalchemy
+confuse
+python-dotenv
\ No newline at end of file
diff --git a/src/cli.py b/src/cli.py
new file mode 100644
index 0000000..5798173
--- /dev/null
+++ b/src/cli.py
@@ -0,0 +1,109 @@
+import typer
+app = typer.Typer()
+
+
+@app.command(name='bootstrap', help='Bootstrap dataset with data from zip files in data/bootstrap')
+def build_dataset(symbol: str, currency: str):
+    target_name = '../data/dataset-{symbol}{currency}'.format(symbol=symbol, currency=currency)
+    from crawlers import kraken, coinmetrics
+    _kraken = kraken.get_bootstrap_data(symbol, currency).fillna(method='ffill')
+    _coinmetrics = coinmetrics.get_bootstrap_data(symbol)
+
+    ohlcv = kraken.ticks_to_ohlcv(_kraken, '1D').fillna(method='ffill')
+
+    from dataset import build, get_feature_metadata, make_target
+    import pandas as pd
+    result = build(ohlcv=ohlcv, coinmetrics=_coinmetrics, W=10)
+    result.to_csv(target_name + '.csv', index_label='timestamp')
+
+    _begin, _end, _features = get_feature_metadata(result)
+    meta = pd.DataFrame.from_records(_features)
+    meta.index = meta['name']
+    meta.drop(labels='name', axis='columns', inplace=True)
+    meta.to_csv(target_name + '.meta.csv', index_label='feature')
+
+    target = make_target(ohlcv)
+    target.to_csv(target_name + '.target.csv', index_label='timestamp')
+
+    info = {
+        'symbol': symbol,
+        'currency': currency,
+        'interval': '1D',
+        'records': result.shape[0],
+        'n_features': result.shape[1],
+        'index_min': result.index.min().to_pydatetime().isoformat(),
+        'index_max': result.index.max().to_pydatetime().isoformat(),
+        'valid_index_min': _begin,
+        'valid_index_max': _end,
+        'targets': {str(k): k == 'class' for k in target.columns},
+        'features': {str(k): True for k in meta.index}
+    }
+    with open(target_name + '.info.yaml', 'w') as f:
+        import yaml
+        yaml.dump(info, f, sort_keys=False)
+    print('done')
+
+@app.command(name='selection', help='Perform feature selection and update .info.yaml with selected features')
+def selection(symbol: str, currency: str, percent: float):
+    target_name = '../data/dataset-{symbol}{currency}'.format(symbol=symbol, currency=currency)
+    import pandas as pd
+    import math
+    from crawlers import load_yaml, save_yaml
+
+    info = load_yaml(target_name + '.info.yaml')
+    dataset = pd.read_csv(target_name + '.csv', parse_dates=True, index_col='timestamp')
+    target = pd.read_csv(target_name + '.target.csv', parse_dates=True, index_col='timestamp')
+    first_valid_i = dataset.index.get_loc(info.valid_index_min)
+    last_valid_i = dataset.index.get_loc(info.valid_index_max)
+
+    training_records = math.floor((last_valid_i - first_valid_i) * percent)
+    dataset['label'] = target['class']
+    training_dataset = dataset.iloc[first_valid_i:first_valid_i+training_records]
+    # testing_dataset = dataset.iloc[first_valid_i+training_records: last_valid_i+1]
+
+    from xgboost import XGBClassifier
+    from util.selection_pipeline import Pipeline
+    from sklearn.impute import SimpleImputer
+    from sklearn.preprocessing import StandardScaler
+    pipeline = Pipeline(steps=[
+        ('s', StandardScaler()),
+        ('i', SimpleImputer()),
+        ('c', XGBClassifier(use_label_encoder=False))
+    ])
+
+    X_train = training_dataset.drop(labels=['label'], axis='columns')
+    with pd.option_context('mode.use_inf_as_na', True):  # Treat inf values as NaN while filling
+        X_train.fillna(axis='columns', method='ffill', inplace=True)
+    y_train = training_dataset['label']
+
+    from sklearn.feature_selection import SelectFromModel
+    sel = SelectFromModel(pipeline)
+    sel.fit(X_train, y_train)
+    support = sel.get_support()
+
+    import yaml
+    dinfo = info.to_dict()
+    # Back up the original info file before the feature mask is updated
+    with open(target_name + '.info.yaml.bak', 'w') as f:
+        yaml.dump(dinfo, f, sort_keys=False)
+    for c, mask in zip(X_train.columns, support):
+        dinfo['features'][c] = bool(mask)
+    with open(target_name + '.info.yaml', 'w') as f:
+        yaml.dump(dinfo, f, sort_keys=False)
+    print('done')
+
+
+@app.command()
+def test(symbol: str, currency: str):
+    from dataset import make_ohlcv_ta
+    from crawlers import kraken
+    _kraken = kraken.get_bootstrap_data(symbol, currency).fillna(method='ffill')
+    ohlcv = kraken.ticks_to_ohlcv(_kraken, '1D').fillna(method='ffill')
+    ohlcv_ta = make_ohlcv_ta(ohlcv)
+    print('It works')
+    print(ohlcv_ta.head())
+
+if __name__ == '__main__':
+    app()
\ No newline at end of file
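
Note on the selection command: `SelectFromModel` requires the wrapped estimator to expose `coef_` or `feature_importances_` after fitting, which sklearn's own `Pipeline` does not — presumably why the custom `util.selection_pipeline.Pipeline` exists. That module is not part of this diff; the sketch below is only an illustration of the idea, assuming it simply delegates to the final step:

```python
# Hypothetical sketch of util/selection_pipeline.py (not in this diff).
# SelectFromModel reads feature_importances_ from the fitted estimator,
# so the pipeline proxies that attribute from its final step (XGBClassifier).
from sklearn.pipeline import Pipeline as SkPipeline

class Pipeline(SkPipeline):
    @property
    def feature_importances_(self):
        # Delegate to the fitted final estimator
        return self._final_estimator.feature_importances_
```
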
diff --git a/src/config.py b/src/config.py
new file mode 100644
index 0000000..06b9477
--- /dev/null
+++ b/src/config.py
@@ -0,0 +1,32 @@
+import confuse
+from dotenv import load_dotenv
+import re, os, yaml
+
+# Load Configuration
+env_variable_pattern = re.compile(r'.*?\$\{(\w+)\}.*?')
+_configfile = None
+load_dotenv()
+
+def replace_env_variables(loader, node):
+    """
+    Extracts the environment variable from the node's value
+    :param yaml.Loader loader: the yaml loader
+    :param node: the current node in the yaml
+    :return: the parsed string that contains the value of the environment
+    variable
+    """
+    value = loader.construct_scalar(node)
+    match = env_variable_pattern.findall(value)  # find all env variables in the line
+    if match:
+        full_value = value
+        for g in match:
+            full_value = full_value.replace(
+                f'${{{g}}}', os.environ.get(g, g)
+            )
+        return full_value if not full_value.isnumeric() else int(full_value)
+    return value
+
+confuse.Loader.add_constructor('!ENV', replace_env_variables)
+#config = confuse.Configuration('CryptoML-API', __name__)
+config = confuse.LazyConfig('CryptoML', __name__)
+config.set_file('../config.yaml')
diff --git a/src/crawlers/__init__.py b/src/crawlers/__init__.py
new file mode 100644
index 0000000..b5ecb52
--- /dev/null
+++ b/src/crawlers/__init__.py
@@ -0,0 +1,90 @@
+import yaml
+from util.bunch import Bunch
+
+
+class Spec(dict):
+    def __init__(self, base_url, endpoints, **kwargs):
+        if not isinstance(endpoints, dict):
+            raise ValueError('Endpoints must be a dictionary!')
+        self.base_url = base_url
+        self.endpoints = endpoints
+        super().__init__(kwargs)
+
+    @staticmethod
+    def from_dict(spec):
+        if not spec:
+            raise ValueError('Provided spec is invalid.')
+        if 'base_url' not in spec:
+            raise ValueError('Provided spec does not describe a base url.')
+        if 'endpoints' not in spec:
+            raise ValueError('Provided spec does not describe any endpoints.')
+        res = Spec(base_url=spec['base_url'], endpoints=spec['endpoints'])
+        return res
+
+    def __dir__(self):
+        return self.endpoints.keys()
+
+    def __setstate__(self, state):
+        pass
+
+    def __setattr__(self, key, value):
+        if key in ['endpoints', 'base_url']:
+            self[key] = value
+        else:
+            self.endpoints[key] = value
+
+    def __getattr__(self, key):
+        if key in ['endpoints', 'base_url']:
+            return self[key]
+        try:
+            query = self.endpoints[key]
+            # Parametrized endpoints are returned as a bound str.format,
+            # so callers can fill the placeholders: api().endpoint(param=value)
+            if '{' in query and '}' in query:
+                return (self.base_url + query).format
+            return self.base_url + query
+        except KeyError:
+            raise AttributeError('Method not described in api spec: ' + key)
+
+
+def load_api_spec(filename):
+    spec = None
+    with open(filename, 'r') as f:
+        try:
+            spec = yaml.safe_load(f)
+        except yaml.YAMLError as exc:
+            print('error loading api spec: ' + str(exc))
+    if spec:
+        return Spec.from_dict(spec)
+
+def load_yaml(filename):
+    spec = None
+    with open(filename, 'r') as f:
+        try:
+            spec = yaml.safe_load(f)
+        except yaml.YAMLError as exc:
+            print('error loading yaml: ' + str(exc))
+    if spec:
+        return Bunch(**spec)
+
+def save_yaml(filename, data):
+    with open(filename, 'w') as f:
+        try:
+            yaml.safe_dump(data=data, stream=f)
+        except yaml.YAMLError as exc:
+            print('error saving yaml: ' + str(exc))
+
+def load_bootstrap(zip_file, csv_file):
+    import zipfile
+    import pandas as pd
+    with zipfile.ZipFile(zip_file) as z:
+        with z.open(csv_file) as f:
+            train = pd.read_csv(f, delimiter=",", parse_dates=True, index_col='date')
+    return train
+
+def bootstrap_index(filename):
+    index = load_yaml(filename)
+    return index.bootstrap
+
+def load_transformer(filename):
+    import importlib.util
+    import ntpath
+
+    spec = importlib.util.spec_from_file_location("bootstrap." + ntpath.basename(filename)[:-3], filename)
+    transformer = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(transformer)
+    return transformer
\ No newline at end of file
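
For context, the `Spec.__getattr__` trick means endpoint URLs can be built like method calls. A short illustration with a made-up spec (the `example.com` URLs and endpoint names here are hypothetical, not part of the repo):

```python
# Illustrative only: how Spec resolves endpoint attributes.
spec = Spec(
    base_url='https://api.example.com',
    endpoints={
        'status': '/v1/status',              # plain endpoint -> full URL string
        'ticker': '/v1/ticker?pair={pair}',  # parametrized -> bound str.format
    }
)

assert spec.status == 'https://api.example.com/v1/status'
assert spec.ticker(pair='XBTUSD') == 'https://api.example.com/v1/ticker?pair=XBTUSD'
```
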
diff --git a/src/crawlers/coinmetrics-community.yaml b/src/crawlers/coinmetrics-community.yaml
new file mode 100644
index 0000000..8310284
--- /dev/null
+++ b/src/crawlers/coinmetrics-community.yaml
@@ -0,0 +1,6 @@
+base_url: 'https://community-api.coinmetrics.io/v4'
+api_key: ''
+endpoints:
+  asset_metadata: '/catalog/assets?assets={assets}&pretty=false'
+  metrics_timeseries: '/timeseries/asset-metrics?assets={assets}&metrics={metrics}&frequency={frequency}&status=all&start_time={begin}&end_time={end}&end_inclusive=true&timezone=Europe/Rome'
+
diff --git a/src/crawlers/coinmetrics.py b/src/crawlers/coinmetrics.py
new file mode 100644
index 0000000..1302117
--- /dev/null
+++ b/src/crawlers/coinmetrics.py
@@ -0,0 +1,62 @@
+def api():
+    from . import load_api_spec
+    return load_api_spec('crawlers/coinmetrics-community.yaml')
+
+def get_assets(assets):
+    if isinstance(assets, list):
+        assets = ','.join(assets)
+    import requests
+    resp = requests.get(api().asset_metadata(assets=assets))
+    return resp.json()['data']
+
+def get_asset_features(asset_name, frequency):
+    assets = get_assets(asset_name)
+    result = []
+    idx = 1
+    for asset in assets:
+        for metric in asset['metrics']:
+            for f in metric['frequencies']:
+                if frequency == f['frequency']:
+                    result.append({
+                        'index': idx,
+                        'dataset': 'coinmetrics',
+                        'asset': asset['asset'],
+                        'name': metric['metric'],
+                        'min': f['min_time'],
+                        'max': f['max_time'],
+                        'enabled': True
+                    })
+                    idx += 1
+    return result
+
+def get_asset_metrics(asset_name, metrics, frequency, begin, end):
+    if isinstance(metrics, list):
+        metrics = ','.join(metrics)
+    import requests
+    resp = requests.get(api().metrics_timeseries(assets=asset_name, metrics=metrics, frequency=frequency, begin=begin, end=end))
+    rj = resp.json()
+    result = rj['data']
+    import time
+    # Follow pagination: the community API returns a next_page_url while more data is available
+    if 'next_page_url' in rj:
+        while True:
+            time.sleep(0.5)  # throttle requests to stay within rate limits
+            resp = requests.get(rj['next_page_url'])
+            rj = resp.json()
+            result += rj['data']
+            if 'next_page_url' not in rj:
+                break
+    return result
+
+
+def get_bootstrap_data(symbol):
+    symbol = symbol.lower()
+
+    from . import bootstrap_index, load_transformer
+    try:
+        index = bootstrap_index('../data/bootstrap/index.yaml')
+        transformer = load_transformer('../data/bootstrap/' + index.coinmetrics.transformer)
+        if symbol not in index.coinmetrics.groups:
+            filename = index.coinmetrics.name_format.format(symbol=symbol) + '.csv'
+            return transformer.get_df('../data/bootstrap/' + index.coinmetrics.zipfile, filename)
+        else:
+            filenames = [index.coinmetrics.name_format.format(symbol=symbol) + '.csv']
+            filenames += [name + '.csv' for name in index.coinmetrics.groups[symbol]]
+            dataframes = [transformer.get_df('../data/bootstrap/' + index.coinmetrics.zipfile, filename) for filename in filenames]
+
+            import pandas as pd
+            return pd.concat(dataframes)
+    except Exception as e:
+        print('Exception occurred! ' + str(e))
+        raise
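
A usage sketch for the paginated fetch. `PriceUSD` and `AdrActCnt` are typical CoinMetrics community metrics, but availability per asset should be checked via the catalog (`get_asset_features`); the dates here are arbitrary:

```python
# Illustrative call: fetch daily price and active-address metrics for BTC.
from crawlers import coinmetrics

rows = coinmetrics.get_asset_metrics(
    asset_name='btc',
    metrics=['PriceUSD', 'AdrActCnt'],
    frequency='1d',
    begin='2020-01-01',
    end='2020-12-31',
)
print(len(rows), rows[0])  # list of dicts, one per (asset, time) record
```
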
diff --git a/src/crawlers/kraken.py b/src/crawlers/kraken.py
new file mode 100644
index 0000000..1ea7692
--- /dev/null
+++ b/src/crawlers/kraken.py
@@ -0,0 +1,51 @@
+def api():
+    from . import load_api_spec
+    return load_api_spec('crawlers/kraken.yaml')
+
+def get_pair_ohlc(pair, metrics, frequency, begin, end):
+    if isinstance(metrics, list):
+        metrics = ','.join(metrics)
+    import requests
+    resp = requests.get(api().ohlc_data(assets=pair, metrics=metrics, frequency=frequency, since=0))
+    rj = resp.json()
+    result = rj['data']
+    import time
+    if 'next_page_url' in rj:
+        while True:
+            time.sleep(0.5)
+            resp = requests.get(rj['next_page_url'])
+            rj = resp.json()
+            result += rj['data']
+            if 'next_page_url' not in rj:
+                break
+    return result
+
+
+def get_bootstrap_data(symbol, currency):
+    from . import bootstrap_index, load_transformer
+    # Kraken uses legacy codes for some assets (XBT for BTC, XDG for DOGE)
+    _convert_map = {
+        'btc': 'xbt',
+        'doge': 'xdg'
+    }
+    if symbol in _convert_map:
+        symbol = _convert_map[symbol]
+    if currency in _convert_map:
+        currency = _convert_map[currency]
+    try:
+        index = bootstrap_index('../data/bootstrap/index.yaml')
+        transformer = load_transformer('../data/bootstrap/' + index.kraken.transformer)
+        if index.kraken.groups:
+            raise ValueError('Groups are not supported for kraken Loader')
+        filename = index.kraken.name_format.format(symbol=symbol.upper(), currency=currency.upper()) + '.csv'
+        return transformer.get_df('../data/bootstrap/' + index.kraken.zipfile, filename)
+    except Exception as e:
+        print('Exception occurred! ' + str(e))
+        raise
+
+
+def ticks_to_ohlcv(ticks, interval):
+    resample = ticks.resample(interval)
+    ohlc = resample['price'].ohlc()
+    ohlc['volume'] = resample['amount'].sum()
+    return ohlc
\ No newline at end of file
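
For reference, `ticks_to_ohlcv` expects a DataFrame with a `DatetimeIndex` and `price`/`amount` columns, as produced by the bootstrap loader. A minimal sketch with synthetic ticks (the values are made up):

```python
# Minimal sketch of the tick -> OHLCV resampling on synthetic data.
import pandas as pd
from crawlers.kraken import ticks_to_ohlcv

ticks = pd.DataFrame(
    {'price': [100.0, 101.5, 99.8, 102.0], 'amount': [0.5, 1.2, 0.3, 2.0]},
    index=pd.to_datetime([
        '2021-01-01 09:00', '2021-01-01 15:00',
        '2021-01-02 10:00', '2021-01-02 18:00',
    ]),
)
ohlcv = ticks_to_ohlcv(ticks, '1D')
print(ohlcv)  # one row per day: open, high, low, close, volume
```
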
diff --git a/src/crawlers/kraken.yaml b/src/crawlers/kraken.yaml
new file mode 100644
index 0000000..e677ae6
--- /dev/null
+++ b/src/crawlers/kraken.yaml
@@ -0,0 +1,6 @@
+base_url: 'https://api.kraken.com'
+api_key: ''
+endpoints:
+  asset_info: '/0/public/Assets?asset={assets}' # https://docs.kraken.com/rest/#operation/getAssetInfo
+  # OHLC Returns: [int