fix DB once and for all. Make DBSCAN more efficient and robust.
This commit is contained in:
parent
a3799c4d5d
commit
a42a060ab5
@ -105,7 +105,7 @@ Mandatory parameters are marked as **Required**, which means that they are requi
|
|||||||
| `stratify_training_data` | This value is used to indicate the stratification of the data. e.g. 2 would set every 2nd data point into a separate dataset to be pulled from during training/testing. <br> **Datatype:** positive integer.
|
| `stratify_training_data` | This value is used to indicate the stratification of the data. e.g. 2 would set every 2nd data point into a separate dataset to be pulled from during training/testing. <br> **Datatype:** positive integer.
|
||||||
| `indicator_max_period_candles` | The maximum *period* used in `populate_any_indicators()` for indicator creation. FreqAI uses this information in combination with the maximum timeframe to calculate how many data points it should download so that the first data point does not have a NaN <br> **Datatype:** positive integer.
|
| `indicator_max_period_candles` | The maximum *period* used in `populate_any_indicators()` for indicator creation. FreqAI uses this information in combination with the maximum timeframe to calculate how many data points it should download so that the first data point does not have a NaN <br> **Datatype:** positive integer.
|
||||||
| `indicator_periods_candles` | A list of integers used to duplicate all indicators according to a set of periods and add them to the feature set. <br> **Datatype:** list of positive integers.
|
| `indicator_periods_candles` | A list of integers used to duplicate all indicators according to a set of periods and add them to the feature set. <br> **Datatype:** list of positive integers.
|
||||||
| `DBSCAN_outlier_pct` | Inactive by default. If user sets this to a fractional value, DBSCAN is used to cluster the training data and remove user set percentage of training data as outliers. <br> **Datatype:** float (fraction of 1).
|
| `use_DBSCAN_to_remove_outliers` | Inactive by default. If true, FreqAI clusters data using DBSCAN to identify and remove outliers from training and prediction data. <br> **Datatype:** boolean.
|
||||||
| | **Data split parameters**
|
| | **Data split parameters**
|
||||||
| `data_split_parameters` | Include any additional parameters available from Scikit-learn `train_test_split()`, which are shown [here](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html) <br> **Datatype:** dictionary.
|
| `data_split_parameters` | Include any additional parameters available from Scikit-learn `train_test_split()`, which are shown [here](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html) <br> **Datatype:** dictionary.
|
||||||
| `test_size` | Fraction of data that should be used for testing instead of training. <br> **Datatype:** positive float below 1.
|
| `test_size` | Fraction of data that should be used for testing instead of training. <br> **Datatype:** positive float below 1.
|
||||||
@ -535,19 +535,12 @@ FreqAI will train an SVM on the training data (or components if the user activat
|
|||||||
|
|
||||||
### Clustering the training data and removing outliers with DBSCAN
|
### Clustering the training data and removing outliers with DBSCAN
|
||||||
|
|
||||||
The user can tell FreqAI to use DBSCAN to cluster training data and remove outliers from the training data set. The user set
|
The user can tell FreqAI to use DBSCAN to cluster the training data and remove outliers from the training data set by activating `use_DBSCAN_to_remove_outliers`. The same clustering is also used to detect outliers among incoming prediction data points.
|
||||||
parameter `DBSCAN_outlier_pct` allows the user to indicate the percent of expected outliers to be removed during each training
|
|
||||||
(typically set below 0.05). Higher value increases confidence in the model predictions but reduces the entry frequency.
|
|
||||||
|
|
||||||
The FreqAI DBSCAN wrapper performs an iterative solution to solving the `eps` hyper parameter. `eps` controls the fraction of
|
|
||||||
training data considered to be an outlier - thus the iterative solution finds the exact value associated with the user designated
|
|
||||||
`DBSCAN_outlier_pct`. This iterative solution is performed once per training. FreqAI stores the `eps` to be used when DBSCAN
|
|
||||||
is again called to determine if incoming prediction candles are outliers.
|
|
||||||
|
|
||||||
```json
|
```json
|
||||||
"freqai": {
|
"freqai": {
|
||||||
"feature_parameters" : {
|
"feature_parameters" : {
|
||||||
"DBSCAN_outlier_pct": 0.05
|
"use_DBSCAN_to_remove_outliers": true
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
@ -21,6 +21,8 @@ from freqtrade.exceptions import OperationalException
|
|||||||
from freqtrade.resolvers import ExchangeResolver
|
from freqtrade.resolvers import ExchangeResolver
|
||||||
from freqtrade.strategy.interface import IStrategy
|
from freqtrade.strategy.interface import IStrategy
|
||||||
|
|
||||||
|
from sklearn.neighbors import NearestNeighbors
|
||||||
|
|
||||||
|
|
||||||
SECONDS_IN_DAY = 86400
|
SECONDS_IN_DAY = 86400
|
||||||
SECONDS_IN_HOUR = 3600
|
SECONDS_IN_HOUR = 3600
|
||||||
@ -91,7 +93,7 @@ class FreqaiDataKitchen:
|
|||||||
if self.live:
|
if self.live:
|
||||||
db_url = self.config.get('db_url', None)
|
db_url = self.config.get('db_url', None)
|
||||||
self.database_path = Path(db_url)
|
self.database_path = Path(db_url)
|
||||||
self.database_name = self.database_path.parts[-1]
|
self.database_name = Path(*self.database_path.parts[1:])
|
||||||
|
|
||||||
self.trade_database_df: DataFrame = pd.DataFrame()
|
self.trade_database_df: DataFrame = pd.DataFrame()
|
||||||
|
|
||||||
@ -606,7 +608,7 @@ class FreqaiDataKitchen:
|
|||||||
clustering = DBSCAN(
|
clustering = DBSCAN(
|
||||||
eps=self.data['DBSCAN_eps'],
|
eps=self.data['DBSCAN_eps'],
|
||||||
min_samples=self.data['DBSCAN_min_samples'],
|
min_samples=self.data['DBSCAN_min_samples'],
|
||||||
n_jobs=-1
|
n_jobs=self.thread_count
|
||||||
).fit(df)
|
).fit(df)
|
||||||
do_predict = np.where(clustering.labels_[-num_preds:] == -1, 0, 1)
|
do_predict = np.where(clustering.labels_[-num_preds:] == -1, 0, 1)
|
||||||
|
|
||||||
@ -618,33 +620,22 @@ class FreqaiDataKitchen:
|
|||||||
self.do_predict -= 1
|
self.do_predict -= 1
|
||||||
|
|
||||||
else:
|
else:
|
||||||
outlier_target = self.freqai_config['feature_parameters'].get('DBSCAN_outlier_pct')
|
|
||||||
if eps:
|
|
||||||
epsilon = eps
|
|
||||||
else:
|
|
||||||
epsilon = 10
|
|
||||||
logger.info('DBSCAN starting from high value. This should be faster next train.')
|
|
||||||
|
|
||||||
error = 1.
|
MinPts = len(self.data_dictionary['train_features'].columns)*2
|
||||||
MinPts = len(self.data_dictionary['train_features'].columns)
|
# measure pairwise distances to train_features.shape[1]*2 nearest neighbours
|
||||||
logger.info(
|
neighbors = NearestNeighbors(
|
||||||
f'DBSCAN finding best clustering for {outlier_target}% outliers.')
|
n_neighbors=MinPts, n_jobs=self.thread_count)
|
||||||
|
neighbors_fit = neighbors.fit(self.data_dictionary['train_features'])
|
||||||
|
distances, _ = neighbors_fit.kneighbors(self.data_dictionary['train_features'])
|
||||||
|
distances = np.sort(distances, axis=0)
|
||||||
|
index_ten_pct = int(len(distances[:, 1]) * 0.1)
|
||||||
|
distances = distances[index_ten_pct:, 1]
|
||||||
|
epsilon = distances[-1]
|
||||||
|
|
||||||
# find optimal value for epsilon using an iterative approach:
|
clustering = DBSCAN(eps=epsilon, min_samples=MinPts,
|
||||||
while abs(np.sqrt(error)) > 0.1:
|
n_jobs=int(self.thread_count)).fit(
|
||||||
clustering = DBSCAN(eps=epsilon, min_samples=MinPts,
|
self.data_dictionary['train_features']
|
||||||
n_jobs=int(self.thread_count / 2)).fit(
|
)
|
||||||
self.data_dictionary['train_features']
|
|
||||||
)
|
|
||||||
outlier_pct = np.count_nonzero(clustering.labels_ == -1) / len(clustering.labels_)
|
|
||||||
error = (outlier_pct - outlier_target) ** 2 / outlier_target
|
|
||||||
multiplier = (outlier_pct - outlier_target) if outlier_pct > 0 else 1 * \
|
|
||||||
np.sign(outlier_pct - outlier_target)
|
|
||||||
multiplier = 1 + error * multiplier
|
|
||||||
epsilon = multiplier * epsilon
|
|
||||||
logger.info(
|
|
||||||
f'DBSCAN error {error:.2f} for eps {epsilon:.2f}'
|
|
||||||
f' and outlier pct {outlier_pct:.2f}')
|
|
||||||
|
|
||||||
logger.info(f'DBSCAN found eps of {epsilon}.')
|
logger.info(f'DBSCAN found eps of {epsilon}.')
|
||||||
|
|
||||||
@ -939,6 +930,8 @@ class FreqaiDataKitchen:
|
|||||||
prepend=self.config.get("prepend_data", False),
|
prepend=self.config.get("prepend_data", False),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
exchange.close()
|
||||||
|
|
||||||
def set_all_pairs(self) -> None:
|
def set_all_pairs(self) -> None:
|
||||||
|
|
||||||
self.all_pairs = copy.deepcopy(
|
self.all_pairs = copy.deepcopy(
|
||||||
@ -1053,6 +1046,26 @@ class FreqaiDataKitchen:
|
|||||||
self.trade_database_df = df.dropna(subset='close_date')
|
self.trade_database_df = df.dropna(subset='close_date')
|
||||||
data.close()
|
data.close()
|
||||||
|
|
||||||
|
def fit_circle_2d(self, x, y, w=None) -> float:
    """
    Fit a circle to 2D points by linear least squares and return its radius.

    The circle equation (x - xc)^2 + (y - yc)^2 = r^2 is rearranged into the
    linear system  x^2 + y^2 = c0 * x + c1 * y + c2,  with c0 = 2 * xc,
    c1 = 2 * yc and c2 = r^2 - xc^2 - yc^2, which is solved for c.

    :param x: sequence of x coordinates
    :param y: sequence of y coordinates (same length as x)
    :param w: optional sequence of per-point weights. The weights are only
        applied when their length equals the number of points; otherwise
        an unweighted fit is performed.
    :return: radius of the fitted circle
    """
    # Accept plain Python sequences as well as numpy arrays
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    A = np.column_stack((x, y, np.ones(len(x))))
    b = x**2 + y**2

    # Modify A,b for weighted least squares. Scaling the rows directly is
    # equivalent to multiplying by diag(w) without allocating an n x n matrix.
    if w is not None and len(w) == len(x):
        weights = np.asarray(w, dtype=float)
        A = A * weights[:, np.newaxis]
        b = b * weights

    # Solve by method of least squares
    c = np.linalg.lstsq(A, b, rcond=None)[0]

    # Get circle parameters from solution c
    xc = c[0] / 2
    yc = c[1] / 2
    r = np.sqrt(c[2] + xc**2 + yc**2)
    return float(r)
|
||||||
|
|
||||||
def np_encoder(self, object):
|
def np_encoder(self, object):
|
||||||
if isinstance(object, np.generic):
|
if isinstance(object, np.generic):
|
||||||
return object.item()
|
return object.item()
|
||||||
|
@ -384,7 +384,7 @@ class IFreqaiModel(ABC):
|
|||||||
if self.freqai_info["feature_parameters"].get("DI_threshold", 0):
|
if self.freqai_info["feature_parameters"].get("DI_threshold", 0):
|
||||||
dk.data["avg_mean_dist"] = dk.compute_distances()
|
dk.data["avg_mean_dist"] = dk.compute_distances()
|
||||||
|
|
||||||
if self.freqai_info["feature_parameters"].get("DBSCAN_outlier_pct", 0):
|
if self.freqai_info["feature_parameters"].get("use_DBSCAN_to_remove_outliers", False):
|
||||||
if dk.pair in self.dd.old_DBSCAN_eps:
|
if dk.pair in self.dd.old_DBSCAN_eps:
|
||||||
eps = self.dd.old_DBSCAN_eps[dk.pair]
|
eps = self.dd.old_DBSCAN_eps[dk.pair]
|
||||||
else:
|
else:
|
||||||
@ -414,7 +414,7 @@ class IFreqaiModel(ABC):
|
|||||||
if self.freqai_info["feature_parameters"].get("DI_threshold", 0):
|
if self.freqai_info["feature_parameters"].get("DI_threshold", 0):
|
||||||
dk.check_if_pred_in_training_spaces()
|
dk.check_if_pred_in_training_spaces()
|
||||||
|
|
||||||
if self.freqai_info["feature_parameters"].get("DBSCAN_outlier_pct", 0):
|
if self.freqai_info["feature_parameters"].get("use_DBSCAN_to_remove_outliers", False):
|
||||||
dk.use_DBSCAN_to_remove_outliers(predict=True)
|
dk.use_DBSCAN_to_remove_outliers(predict=True)
|
||||||
|
|
||||||
def model_exists(
|
def model_exists(
|
||||||
|
Loading…
Reference in New Issue
Block a user