# conf_training_example_baskerville.yaml
---
database:
  name: baskerville # the database name
  user: !ENV ${DB_USER}
  password: !ENV '${DB_PASS}'
  type: 'postgres'
  host: !ENV ${DB_HOST}
  port: !ENV ${DB_PORT}
  maintenance: # Optional, for data partitioning and archiving
    partition_table: 'request_sets' # default value
    partition_by: week # partition by week or month, default is week
    partition_field: created_at # which field to use for the partitioning; this is the default value and can be omitted
    strict: False # if False, the start and end dates of a week partition are widened to the start and end of the respective weeks; if True, the dates are used as given. Be careful to stay consistent with this.
    data_partition: # Optional: define the period to create partitions for
      since: 2020-01-01 # when to start partitioning
      until: "2020-12-31 23:59:59" # when to stop partitioning
    index_by: # which fields to index in the created partitions (only one index is currently supported); default value, can be omitted
      - target
      - ip
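# The !ENV tags in this file are resolved from environment variables when the
# config is loaded. A minimal sketch of the variables it expects, assuming the
# names used in the placeholders above (the values are illustrative only):
#   export DB_USER=baskerville
#   export DB_PASS=changeme
#   export DB_HOST=localhost
#   export DB_PORT=5432
#   export BASKERVILLE_ROOT=/path/to/baskerville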
engine:
  storage_path: !ENV '${BASKERVILLE_ROOT}/data/'
  training: # Optional: only for the Training pipeline
    model: 'baskerville.models.anomaly_model.AnomalyModel' # which model to use, see baskerville.util.enums.ModelEnum
    data_parameters: # to define the training period, use either training_days or from_date/to_date
      # training_days: 30 # today - training_days
      from_date: '2020-03-01 20:59:59'
      to_date: '2020-03-05 13:01:01'
    model_parameters: # depending on the chosen model, provide the training parameters as you would to instantiate the class
      num_trees: 100 # number of trees
      max_samples: 100 # number of samples per tree
      categorical_features: ['target'] # the list of categorical features
      # contamination: 0.1 # the target proportion of anomalies in the dataset; use either contamination or threshold
      threshold: 0.45 # the anomaly threshold; use either contamination or threshold
  data_config:
    schema: !ENV '${BASKERVILLE_ROOT}/data/samples/sample_log_schema.json'
  logpath: !ENV '${BASKERVILLE_ROOT}/baskerville.log'
  log_level: 'ERROR'
spark:
  app_name: 'Baskerville' # the application name - can be changed between runs - shown in the Spark UI
  master: 'local' # the master URL, e.g. spark://someip:7077 to submit to a cluster
  parallelism: -1 # controls the number of tasks; -1 means use all cores - only relevant for a local master
  log_level: 'INFO' # Spark log level
  storage_level: 'OFF_HEAP' # which strategy to use for persisting dataframes - valid values are the ones listed at https://spark.apache.org/docs/2.4.0/api/python/_modules/pyspark/storagelevel.html - default: OFF_HEAP
  jars: !ENV '${BASKERVILLE_ROOT}/data/jars/postgresql-42.2.4.jar,${BASKERVILLE_ROOT}/data/spark-iforest-2.4.0.jar' # or /path/to/jars/mysql-connector-java-8.0.11.jar
  spark_driver_memory: '6G' # depends on your dataset and the available RAM; if running locally, 6-8 GB is usually a good choice, depending on how much data you need to process
  metrics_conf: !ENV '${BASKERVILLE_ROOT}/data/spark.metrics' # Optional: required only to export Spark metrics
  jar_packages: 'com.banzaicloud:spark-metrics_2.11:2.3-2.0.4,io.prometheus:simpleclient:0.3.0,io.prometheus:simpleclient_dropwizard:0.3.0,io.prometheus:simpleclient_pushgateway:0.3.0,io.dropwizard.metrics:metrics-core:3.1.2' # required to export Spark metrics
  jar_repositories: 'https://raw.github.com/banzaicloud/spark-metrics/master/maven-repo/releases' # Optional: required only to export Spark metrics
  event_log: True
  serializer: 'org.apache.spark.serializer.KryoSerializer'
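# A sketch of the overrides typically needed to submit to a standalone cluster
# instead of running with a local master (the host, memory, and parallelism
# values below are placeholders, not taken from this example):
#   master: 'spark://master-host:7077'
#   spark_driver_memory: '8G'
#   parallelism: 200 # -1 (all cores) only applies to a local master, per the comment above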