remove trade database analyzer, clean up a bit

This commit is contained in:
robcaulk 2022-08-10 17:43:06 +02:00
parent 91d0c91287
commit 2cae3c42e6
3 changed files with 1 additions and 287 deletions

View File

@ -663,79 +663,13 @@ The user needs to set the standard dictionary in the config so FreqAI can return
These values will likely be overridden by the user prediction model, but in the case where the user model has yet to set them, or needs
a default initial value, these are the values that will be returned.
## Analyzing the trade live database
Users can analyze the live trade database by overriding `analyze_trade_database()` in their custom prediction model. FreqAI already has the
database loaded into a pandas dataframe (`dk.trade_database_df`) and ready to be analyzed. Here is an example use case:
```python
def analyze_trade_database(self, dk: FreqaiDataKitchen, pair: str) -> None:
    """
    User analyzes the trade database here and stores summary stats which will be passed back
    to the strategy for reinforcement learning or for additional adaptive metrics for use
    in entry/exit signals. Store these metrics in dk.data['extra_returns_per_train'] and
    they will format themselves into the dataframe as an additional column in the user
    strategy. User has access to the current trade database in dk.trade_database_df.

    :param dk: data kitchen of the pair being trained; provides `trade_database_df`
        and the `data['extra_returns_per_train']` dictionary written to here.
    :param pair: pair that was just trained (unused in this example).
    """
    # The dataframe starts out empty (no columns at all) and stays that way when
    # no trade database could be read, so indexing a column would raise KeyError.
    if dk.trade_database_df.empty:
        return

    total_profit = dk.trade_database_df['close_profit_abs'].sum()
    dk.data['extra_returns_per_train']['total_profit'] = total_profit
```
## Building an IFreqaiModel
FreqAI ships with multiple example prediction models based on libraries such as `Catboost` regression (`freqai/prediction_models/CatboostRegressor.py`) and `LightGBM` regression.
However, users can customize and create their own prediction models using the `IFreqaiModel` class.
Users are encouraged to override `train()` and `predict()` to customize various aspects of their training procedures.
<!-- ## Dynamic target expectation
The labels used for model training have a unique statistical distribution for each separate model training.
We can use this information to know if our current prediction is in the realm of what the model was trained on,
and if so, what is the statistical probability of the current prediction. With this information, we can
make more informed predictions.
FreqAI builds this label distribution and provides a quantile to the strategy, which can be optionally used as a
dynamic threshold. The `target_quantile: X` means that X% of the labels are below this value. So setting:
```json
"freqai": {
"feature_parameters" : {
"target_quantile": 0.9
}
}
```
Means the user will get back in the strategy the label threshold at which 90% of the labels were
below this value. An example usage in the strategy may look something like:
```python
def populate_indicators(self, dataframe: DataFrame, metadata: dict) -> DataFrame:
    # ... #

    # FreqAI hands back four outputs; unpack them first, then attach each one
    # to the strategy dataframe under its own column.
    prediction, do_predict, upper_quantile, lower_quantile = self.freqai.start(
        dataframe, metadata, self
    )
    dataframe["prediction"] = prediction
    dataframe["do_predict"] = do_predict
    dataframe["target_upper_quantile"] = upper_quantile
    dataframe["target_lower_quantile"] = lower_quantile

    return dataframe
def populate_buy_trend(self, dataframe: DataFrame, metadata: dict) -> DataFrame:
    # Enter only where the prediction clears the upper label quantile and
    # FreqAI marked the row as usable (do_predict == 1).
    above_quantile = dataframe["prediction"] > dataframe["target_upper_quantile"]
    prediction_usable = dataframe["do_predict"] == 1
    buy_conditions = [above_quantile & prediction_usable]

    if not buy_conditions:
        return dataframe

    # OR all conditions together into a single boolean row mask.
    entry_mask = reduce(lambda combined, cond: combined | cond, buy_conditions)
    dataframe.loc[entry_mask, "buy"] = 1

    return dataframe
``` -->
## Additional information
### Common pitfalls

View File

@ -2,9 +2,8 @@ import copy
import datetime
import logging
import shutil
import sqlite3
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from typing import Any, Dict, List, Tuple
import numpy as np
import numpy.typing as npt
@ -88,20 +87,6 @@ class FreqaiDataKitchen:
config["freqai"]["backtest_period_days"],
)
self.database_path: Optional[Path] = None
if self.live:
db_url = self.config.get('db_url', None)
self.database_path = Path(db_url)
if 'sqlite' not in self.database_path.parts[0]:
self.database_path = None
logger.warning('FreqAI database analyzer only available for sqlite dbs. '
' FreqAI will still run, but user cannot use database analyzer.')
else:
self.database_name = Path(*self.database_path.parts[1:])
self.trade_database_df: DataFrame = pd.DataFrame()
self.data['extra_returns_per_train'] = self.freqai_config.get('extra_returns_per_train', {})
self.thread_count = self.freqai_config.get("data_kitchen_thread_count", -1)
self.train_dates: DataFrame = pd.DataFrame()
@ -1007,13 +992,6 @@ class FreqaiDataKitchen:
f = spy.stats.norm.fit(self.data_dictionary["train_labels"][label])
self.data["labels_mean"][label], self.data["labels_std"][label] = f[0], f[1]
# KEEPME in case we want to let the user start to grab quantiles.
# upper_q = spy.stats.norm.ppf(self.freqai_config['feature_parameters'][
# 'target_quantile'], *f)
# lower_q = spy.stats.norm.ppf(1 - self.freqai_config['feature_parameters'][
# 'target_quantile'], *f)
# self.data["upper_quantile"] = upper_q
# self.data["lower_quantile"] = lower_q
return
def remove_features_from_df(self, dataframe: DataFrame) -> DataFrame:
@ -1025,181 +1003,3 @@ class FreqaiDataKitchen:
col for col in dataframe.columns if not col.startswith("%") or col.startswith("%%")
]
return dataframe[to_keep]
def get_current_trade_database(self) -> None:
    """
    Load the closed trades from the sqlite trade database into
    `self.trade_database_df`.

    Reads every row of the `trades` table from `self.database_name` and keeps
    only the rows whose `close_date` is set. When `self.database_path` is None
    a warning is logged and `self.trade_database_df` is left untouched.

    :raises sqlite3.Error: if the database cannot be opened or queried; the
        connection is closed before the error propagates.
    """
    if self.database_path is None:
        logger.warning('No trade database found. Skipping analysis.')
        return

    connection = sqlite3.connect(self.database_name)
    try:
        query = connection.execute("SELECT * From trades")
        cols = [column[0] for column in query.description]
        df = pd.DataFrame.from_records(data=query.fetchall(), columns=cols)
    finally:
        # Close explicitly: a failing query (e.g. missing `trades` table) must
        # not leave the database handle open.
        connection.close()

    # Pass the label as a list, the form accepted across pandas versions.
    self.trade_database_df = df.dropna(subset=['close_date'])
def np_encoder(self, object):
    """
    Convert a numpy scalar into the equivalent native Python value.

    :param object: value to convert.
    :returns: `object.item()` when `object` is a numpy scalar (`np.generic`),
        otherwise None.
    """
    # NOTE(review): presumably used as a serializer `default=` hook - confirm at call sites.
    is_numpy_scalar = isinstance(object, np.generic)
    return object.item() if is_numpy_scalar else None
# Functions containing useful data manipulation examples, but not actively in use.
# Possibly phasing these outlier removal methods below out in favor of
# use_SVM_to_remove_outliers (computationally more efficient and apparently higher performance).
# But these have good data manipulation examples, so keep them commented here for now.
# def determine_statistical_distributions(self) -> None:
# from fitter import Fitter
# logger.info('Determining best model for all features, may take some time')
# def compute_quantiles(ft):
# f = Fitter(self.data_dictionary["train_features"][ft],
# distributions=['gamma', 'cauchy', 'laplace',
# 'beta', 'uniform', 'lognorm'])
# f.fit()
# # f.summary()
# dist = list(f.get_best().items())[0][0]
# params = f.get_best()[dist]
# upper_q = getattr(spy.stats, list(f.get_best().items())[0][0]).ppf(0.999, **params)
# lower_q = getattr(spy.stats, list(f.get_best().items())[0][0]).ppf(0.001, **params)
# return ft, upper_q, lower_q, dist
# quantiles_tuple = Parallel(n_jobs=-1)(
# delayed(compute_quantiles)(ft) for ft in self.data_dictionary[
# 'train_features'].columns)
# df = pd.DataFrame(quantiles_tuple, columns=['features', 'upper_quantiles',
# 'lower_quantiles', 'dist'])
# self.data_dictionary['upper_quantiles'] = df['upper_quantiles']
# self.data_dictionary['lower_quantiles'] = df['lower_quantiles']
# return
# def remove_outliers(self, predict: bool) -> None:
# """
# Remove data that looks like an outlier based on the distribution of each
# variable.
# :params:
# :predict: boolean which tells the function if this is prediction data or
# training data coming in.
# """
# lower_quantile = self.data_dictionary["lower_quantiles"].to_numpy()
# upper_quantile = self.data_dictionary["upper_quantiles"].to_numpy()
# if predict:
# df = self.data_dictionary["prediction_features"][
# (self.data_dictionary["prediction_features"] < upper_quantile)
# & (self.data_dictionary["prediction_features"] > lower_quantile)
# ]
# drop_index = pd.isnull(df).any(1)
# self.data_dictionary["prediction_features"].fillna(0, inplace=True)
# drop_index = ~drop_index
# do_predict = np.array(drop_index.replace(True, 1).replace(False, 0))
# logger.info(
# "remove_outliers() tossed %s predictions",
# len(do_predict) - do_predict.sum(),
# )
# self.do_predict += do_predict
# self.do_predict -= 1
# else:
# filter_train_df = self.data_dictionary["train_features"][
# (self.data_dictionary["train_features"] < upper_quantile)
# & (self.data_dictionary["train_features"] > lower_quantile)
# ]
# drop_index = pd.isnull(filter_train_df).any(1)
# drop_index = drop_index.replace(True, 1).replace(False, 0)
# self.data_dictionary["train_features"] = self.data_dictionary["train_features"][
# (drop_index == 0)
# ]
# self.data_dictionary["train_labels"] = self.data_dictionary["train_labels"][
# (drop_index == 0)
# ]
# self.data_dictionary["train_weights"] = self.data_dictionary["train_weights"][
# (drop_index == 0)
# ]
# logger.info(
# f'remove_outliers() tossed {drop_index.sum()}'
# f' training points from {len(filter_train_df)}'
# )
# # do the same for the test data
# filter_test_df = self.data_dictionary["test_features"][
# (self.data_dictionary["test_features"] < upper_quantile)
# & (self.data_dictionary["test_features"] > lower_quantile)
# ]
# drop_index = pd.isnull(filter_test_df).any(1)
# drop_index = drop_index.replace(True, 1).replace(False, 0)
# self.data_dictionary["test_labels"] = self.data_dictionary["test_labels"][
# (drop_index == 0)
# ]
# self.data_dictionary["test_features"] = self.data_dictionary["test_features"][
# (drop_index == 0)
# ]
# self.data_dictionary["test_weights"] = self.data_dictionary["test_weights"][
# (drop_index == 0)
# ]
# logger.info(
# f'remove_outliers() tossed {drop_index.sum()}'
# f' test points from {len(filter_test_df)}'
# )
# return
# def standardize_data(self, data_dictionary: Dict) -> Dict[Any, Any]:
# """
# standardize all data in the data_dictionary according to the training dataset
# :params:
# :data_dictionary: dictionary containing the cleaned and split training/test data/labels
# :returns:
# :data_dictionary: updated dictionary with standardized values.
# """
# # standardize the data by training stats
# train_mean = data_dictionary["train_features"].mean()
# train_std = data_dictionary["train_features"].std()
# data_dictionary["train_features"] = (
# data_dictionary["train_features"] - train_mean
# ) / train_std
# data_dictionary["test_features"] = (
# data_dictionary["test_features"] - train_mean
# ) / train_std
# train_labels_std = data_dictionary["train_labels"].std()
# train_labels_mean = data_dictionary["train_labels"].mean()
# data_dictionary["train_labels"] = (
# data_dictionary["train_labels"] - train_labels_mean
# ) / train_labels_std
# data_dictionary["test_labels"] = (
# data_dictionary["test_labels"] - train_labels_mean
# ) / train_labels_std
# for item in train_std.keys():
# self.data[item + "_std"] = train_std[item]
# self.data[item + "_mean"] = train_mean[item]
# self.data["labels_std"] = train_labels_std
# self.data["labels_mean"] = train_labels_mean
# return data_dictionary
# def standardize_data_from_metadata(self, df: DataFrame) -> DataFrame:
# """
# Normalizes a set of data using the mean and standard deviation from
# the associated training data.
# :params:
# :df: Dataframe to be standardized
# """
# for item in df.keys():
# df[item] = (df[item] - self.data[item + "_mean"]) / self.data[item + "_std"]
# return df

View File

@ -491,9 +491,6 @@ class IFreqaiModel(ABC):
model = self.train(unfiltered_dataframe, pair, dk)
dk.get_current_trade_database()
self.analyze_trade_database(dk, pair)
self.dd.pair_dict[pair]["trained_timestamp"] = new_trained_timerange.stopts
dk.set_new_model_names(pair, new_trained_timerange)
self.dd.pair_dict[pair]["first"] = False
@ -612,20 +609,3 @@ class IFreqaiModel(ABC):
:do_predict: np.array of 1s and 0s to indicate places where freqai needed to remove
data (NaNs) or felt uncertain about data (i.e. SVM and/or DI index)
"""
def analyze_trade_database(self, dk: FreqaiDataKitchen, pair: str) -> None:
    """
    Hook for user-defined analysis of the trade database. This default
    implementation does nothing; override it in a custom prediction model.

    Summary stats computed here are passed back to the strategy for
    reinforcement learning or as additional adaptive metrics for use in
    entry/exit signals. Store them in dk.data['extra_returns_per_train'] and
    they will format themselves into the dataframe as an additional column in
    the user strategy. The current trade database is available in
    dk.trade_database_df.

    :param dk: data kitchen of the pair that was just trained.
    :param pair: pair that was just trained.
    """
    # Reference implementation, kept as an example for overriding classes:
    #
    # if dk.trade_database_df.empty:
    #     logger.warning(f'No trades found for {pair} to analyze DB')
    #     return
    #
    # total_profit = dk.trade_database_df['close_profit_abs'].sum()
    # dk.data['extra_returns_per_train']['total_profit'] = total_profit

    return None