From 2446574e3b72d83cf66328b1f64a5ab64f7eddfb Mon Sep 17 00:00:00 2001
From: Sylvain Haupert <sylvain.haupert@mnhn.fr>
Date: Wed, 30 Nov 2022 22:36:58 +0100
Subject: [PATCH] Update A_VIRER__TEST_full_process_test.py, birdnet_test.py,
 and 16 more files...

---
 article/A_VIRER__TEST_full_process_test.py    | 169 -----------------
 article/birdnet_test.py                       |   9 +-
 article/birdnet_train.py                      |  10 +-
 article/birdnet_validation.py                 |   8 +-
 article/clustering_only_test.py               |   8 +-
 article/clustering_only_train.py              |  10 +-
 article/clustering_only_validation.py         |   6 +-
 article/config_article.yaml                   |   4 +-
 article/full_process_test.py                  |   9 +-
 article/full_process_validation.py            |   7 +-
 bambird/__init__.py                           |  31 ++--
 bambird/cluster.py                            | 174 +++++++++++++++++-
 bambird/config.py                             |  98 ++++++++--
 bambird/dataset.py                            |   3 +-
 bambird/features.py                           |   5 +-
 bambird/segmentation.py                       |  16 +-
 bambird/segmentation_extract_rois_core.py     |   2 +-
 bambird/segmentation_extract_rois_full_sig.py |  25 ++-
 18 files changed, 327 insertions(+), 267 deletions(-)
 delete mode 100644 article/A_VIRER__TEST_full_process_test.py

diff --git a/article/A_VIRER__TEST_full_process_test.py b/article/A_VIRER__TEST_full_process_test.py
deleted file mode 100644
index 1c9fa5b..0000000
--- a/article/A_VIRER__TEST_full_process_test.py
+++ /dev/null
@@ -1,169 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Updated 19 October 2022
-
-Authors : Felix Michaud and Sylvain Haupert
-
-"""
-
-from IPython import get_ipython
-print(__doc__)
-# Clear all the variables
-get_ipython().magic('reset -sf')
-
-import yaml
-from pathlib import Path
-import matplotlib.pyplot as plt
-plt.close("all")
-import pandas as pd
-
-import bambird
-
-# %%
-# Define constants
-# ----------------
-
-RANDOM_SEED = 1979
-
-# Choose the path to store the mp3
-DIR_DATA        = Path('../../temporary_data')
-# Choose the name of the dataset
-DATASET_NAME    = Path('A_VIRER_DATASET_PALEARTIC_PART2')
-# Choose the name of the ROIs dataset
-ROIS_NAME       = Path(str(DATASET_NAME) +'_ROIS_TEST')
-# Select the csv file with the metadata collected from xeno-canto (=> links to the mp3 to download)
-XC_CSV_FILE     = Path('./data/test') / 'xc_metadata.csv'
-# Select the annotation file corresponding to the ROIs dataset
-ANNOT_CSV_FILE  = Path('./data/test') / 'manual_annotations.csv'
-# Select the configuration file to segment, compute features and get the clusters
-CONFIG_FILE     = 'config_article.yaml' 
-
-# %%
-if __name__ == '__main__':
-
-    with open(CONFIG_FILE) as f:
-        params = yaml.load(f, Loader=bambird.get_loader())
-        
-    # load the inital dataset with the metadata stored from XC
-    df_dataset = pd.read_csv(XC_CSV_FILE, sep=';')
-    
-    #========================================
-    #========================================
-    df_dataset = df_dataset[0:10]
-    #========================================
-    #========================================
-                
-#%%
-
-    # Download audio Xeno-Canto
-    # -------------------------
-    
-    df_xc, csv_xc  = bambird.download_xc (
-                    df_dataset    = df_dataset,
-                    rootdir       = DIR_DATA,  
-                    dataset_name  = DATASET_NAME,
-                    overwrite     = True,
-                    verbose       = True
-                    )
- 
-    
-    
-#%%        
-    
-    # Extract ROIS
-    # -------------------------------    
-    
-    # extract ROIS
-    df_rois, csv_rois = bambird.multicpu_extract_rois(
-                    dataset             =df_xc,
-                    fun                 =params['PARAMS_EXTRACT']['FUNC'],
-                    params              =params['PARAMS_EXTRACT'],
-                    save_path           =DIR_DATA / ROIS_NAME,
-                    overwrite           =True,
-                    verbose             =True
-                    )
-                
-#%%           
- 
-    # process all the ROIS
-    # ---------------------------------------------------------------------
-
-    # compute features        
-    df_features, csv_features = bambird.multicpu_compute_features(
-                    dataset             =df_rois,
-                    params              =params['PARAMS_FEATURES'],
-                    save_path           =DIR_DATA / ROIS_NAME,
-                    overwrite           =True,
-                    verbose             =True
-                    )
-        
-#%%
-    # Cluster ROIS
-    # ---------------------------------------------------------------------
-
-    # with dataframe or csv file
-    df_cluster = bambird.find_cluster(
-                    dataset     =df_features,
-                    params      =params['PARAMS_CLUSTER'],
-                    display     =False,
-                    verbose     =True
-                    )
-
-    # Evaluation of the clustering : precision and recall
-    # ---------------------------------------------------------------------
-    
-    df_scores, p, r, f, markers = bambird.cluster_eval(
-                    df_cluster, 
-                    path_to_csv_with_gt     = ANNOT_CSV_FILE,
-                    colname_label_gt        ='manual_label',
-                    verbose                 =True
-                    )
-        
-#%%   
-    # Display the ROIS
-    # ---------------------------------------------------------------------
-    filename = bambird.overlay_rois(
-                    cluster     =df_cluster,
-                    params      =params['PARAMS_EXTRACT'],
-                    filename    =None,
-                    random_seed =None,
-                    verbose     =True
-                    )
-    
-    # Display the ROIS with TP=1 TN = 2 FN = 3 FP = 4
-    # ---------------------------------------------------------------------
-    
-    bambird.overlay_rois(
-                    cluster         =df_cluster,
-                    markers         =markers,
-                    params          =params['PARAMS_EXTRACT'],
-                    column_labels   ='marker',
-                    unique_labels   =['FP', 'TP', 'FN', 'TN'],
-                    filename        =filename,
-                    random_seed     =None,
-                    verbose         =True
-                    )
-        
-#%% 
-    # if 'mark_rois' in PROCESS :
-        
-    #     # Mark the ROIS with the prefix TN FN TP FP according to the clustering 
-    #     # ---------------------------------------------------------------------
-                
-    #     df_rois, flag = bambird.mark_rois(
-    #                     markers, 
-    #                     dataset_csv   =csv_rois,            
-    #                     verbose       =True
-    #                     )
-        
-#%%
-    # if 'unmark_rois' in PROCESS :
-        
-    #     # Unmark ROIS with the prefix TN FN TP FP  
-    #     # ---------------------------------------------------------------------
-                        
-    #     df_rois, flag = bambird.unmark_rois(
-    #                     dataset_csv =csv_rois,
-    #                     verbose     =True
-    #                     )
\ No newline at end of file
diff --git a/article/birdnet_test.py b/article/birdnet_test.py
index ab50c98..113299b 100644
--- a/article/birdnet_test.py
+++ b/article/birdnet_test.py
@@ -12,7 +12,6 @@
 # Clear all the variables
 get_ipython().magic('reset -sf')
 
-import yaml
 from pathlib import Path
 import pandas as pd
 import matplotlib.pyplot as plt
@@ -28,15 +27,9 @@
 DIR_DATA        = Path('./data/test')               
 ANNOT_CSV_FILE  = 'manual_annotations.csv' 
 BIRDNET_CSV_FILE= "birdnet_annotations.csv" 
-CONFIG_FILE     = 'config_article.yaml' 
 
 # %%
-if __name__ == '__main__':
-
-    with open(CONFIG_FILE) as f:
-        params = yaml.load(f, Loader=bambird.get_loader())
-    
-# %%         
+if __name__ == '__main__':       
          
     # BirdNET on ROIS 
     # ---------------------------------------------------------------------
diff --git a/article/birdnet_train.py b/article/birdnet_train.py
index 9ad274b..27a62ff 100644
--- a/article/birdnet_train.py
+++ b/article/birdnet_train.py
@@ -12,12 +12,12 @@
 # Clear all the variables
 get_ipython().magic('reset -sf')
 
-import yaml
 from pathlib import Path
 import pandas as pd
 import matplotlib.pyplot as plt
 plt.close("all")
 
+
 import bambird
 
 # %%
@@ -28,16 +28,10 @@
 DIR_DATA        = Path('./data/train')               
 ANNOT_CSV_FILE  = 'manual_annotations.csv' 
 BIRDNET_CSV_FILE= "birdnet_annotations.csv" 
-CONFIG_FILE     = 'config_article.yaml' 
 
 # %%
 if __name__ == '__main__':
-
-    with open(CONFIG_FILE) as f:
-        params = yaml.load(f, Loader=bambird.get_loader())
-        
-# %%         
-         
+       
     # BirdNET on ROIS 
     # ---------------------------------------------------------------------
     # Load the dataframe with the result from BirdNET
diff --git a/article/birdnet_validation.py b/article/birdnet_validation.py
index cf74a9d..0a559fd 100644
--- a/article/birdnet_validation.py
+++ b/article/birdnet_validation.py
@@ -28,16 +28,10 @@
 DIR_DATA        = Path('./data/validation')               
 ANNOT_CSV_FILE  = 'manual_annotations.csv' 
 BIRDNET_CSV_FILE= "birdnet_annotations.csv" 
-CONFIG_FILE     = 'config_article.yaml' 
 
 # %%
 if __name__ == '__main__':
-
-    with open(CONFIG_FILE) as f:
-        params = yaml.load(f, Loader=bambird.get_loader())
-    
-# %%         
-         
+   
     # BirdNET on ROIS 
     # ---------------------------------------------------------------------
     # Load the dataframe with the result from BirdNET
diff --git a/article/clustering_only_test.py b/article/clustering_only_test.py
index 75494c6..df487ac 100644
--- a/article/clustering_only_test.py
+++ b/article/clustering_only_test.py
@@ -12,13 +12,13 @@
 # Clear all the variables
 get_ipython().magic('reset -sf')
 
-import yaml
 from pathlib import Path
-import pandas as pd
 import matplotlib.pyplot as plt
 plt.close("all")
 
 import bambird
+import bambird.config as cfg
+
 # %%
 # Define constants
 # ----------------
@@ -32,8 +32,8 @@
 # %%
 if __name__ == '__main__':
 
-    with open(CONFIG_FILE) as f:
-        params = yaml.load(f, Loader=bambird.get_loader())
+    # Load the configuration file    
+    params = cfg.load_config(CONFIG_FILE)
     
     # Name of the csv file with feaetures
     FEATURES_CSV_FILE = (
diff --git a/article/clustering_only_train.py b/article/clustering_only_train.py
index 4219cd2..e083d13 100644
--- a/article/clustering_only_train.py
+++ b/article/clustering_only_train.py
@@ -17,10 +17,8 @@
 import matplotlib.pyplot as plt
 plt.close("all")
 
-bambird_path = Path('/home/haupert/DATA/mes_projets/Z_THESE_FELIX_MICHAUD/CHAPITRE_1/bambird.git')
-import os
-os.sys.path.append(bambird_path.as_posix())
 import bambird
+import bambird.config as cfg
 
 # %%
 # Define constants
@@ -34,9 +32,9 @@
 
 # %%
 if __name__ == '__main__':
-
-    with open(CONFIG_FILE) as f:
-        params = yaml.load(f, Loader=bambird.get_loader())
+    
+    # Load the configuration file    
+    params = cfg.load_config(CONFIG_FILE)
     
     # Name of the csv file with feaetures
     FEATURES_CSV_FILE = (
diff --git a/article/clustering_only_validation.py b/article/clustering_only_validation.py
index bf17e9c..070cbd0 100644
--- a/article/clustering_only_validation.py
+++ b/article/clustering_only_validation.py
@@ -18,6 +18,8 @@
 plt.close("all")
 
 import bambird
+import bambird.config as cfg
+
 # %%
 # Define constants
 # ----------------
@@ -31,8 +33,8 @@
 # %%
 if __name__ == '__main__':
 
-    with open(CONFIG_FILE) as f:
-        params = yaml.load(f, Loader=bambird.get_loader())
+    # Load the configuration file    
+    params = cfg.load_config(CONFIG_FILE)
     
     # Name of the csv file with feaetures
     FEATURES_CSV_FILE = (
diff --git a/article/config_article.yaml b/article/config_article.yaml
index d5a1ed6..d2d5c2a 100644
--- a/article/config_article.yaml
+++ b/article/config_article.yaml
@@ -17,14 +17,14 @@ PARAMS_XC:
   - len:30-60
   - q:">C"
   - type:song
-  #CSV_XC_FILE: 'dataset_xc_embcit.csv' # The file will contain all the metadata collected from Xeno-canto
+  CSV_XC_FILE: 'bam_dataset_xc.csv' # The file will contain all the metadata collected from Xeno-canto
  
 ################################
 # ROIS EXTRACTION PARAMETERS
 ################################
 PARAMS_EXTRACT:
   # function
-  FUNC: !fun bambird.extract_rois_core  # select the function used to extract the rois {bambird.extract_rois_core, bambird.extract_rois_full_sig}
+  FUNC: !FUNC bambird.extract_rois_core  # select the function used to extract the rois {bambird.extract_rois_core, bambird.extract_rois_full_sig}
   # Extract Audio resampling
   SAMPLE_RATE: 44100      # Sampling frequency in Hz
   # Audio preprocess
diff --git a/article/full_process_test.py b/article/full_process_test.py
index 49f4dd6..e5efccb 100644
--- a/article/full_process_test.py
+++ b/article/full_process_test.py
@@ -12,13 +12,13 @@
 # Clear all the variables
 get_ipython().magic('reset -sf')
 
-import yaml
 from pathlib import Path
 import matplotlib.pyplot as plt
 plt.close("all")
 import pandas as pd
 
 import bambird
+import bambird.config as cfg
 
 # %%
 # Define constants
@@ -42,8 +42,8 @@
 # %%
 if __name__ == '__main__':
 
-    with open(CONFIG_FILE) as f:
-        params = yaml.load(f, Loader=bambird.get_loader())
+    # Load the configuration file    
+    params = cfg.load_config(CONFIG_FILE)
         
     # load the inital dataset with the metadata stored from XC
     df_dataset = pd.read_csv(XC_CSV_FILE, sep=';')
@@ -69,7 +69,6 @@
     # extract ROIS
     df_rois, csv_rois = bambird.multicpu_extract_rois(
                     dataset             =df_xc,
-                    fun                 =params['PARAMS_EXTRACT']['FUNC'],
                     params              =params['PARAMS_EXTRACT'],
                     save_path           =DIR_DATA / ROIS_NAME,
                     overwrite           =True,
@@ -98,7 +97,7 @@
     df_cluster, csv_cluster = bambird.find_cluster(
                     dataset     =df_features,
                     params      =params['PARAMS_CLUSTER'],
-                    save_path           =DIR_DATA / ROIS_NAME,
+                    save_path   =DIR_DATA / ROIS_NAME,
                     display     =False,
                     verbose     =True
                     )
diff --git a/article/full_process_validation.py b/article/full_process_validation.py
index 3c3b89e..f74ea8d 100644
--- a/article/full_process_validation.py
+++ b/article/full_process_validation.py
@@ -12,13 +12,13 @@
 # Clear all the variables
 get_ipython().magic('reset -sf')
 
-import yaml
 from pathlib import Path
 import matplotlib.pyplot as plt
 plt.close("all")
 import pandas as pd
 
 import bambird
+import bambird.config as cfg
 
 # %%
 # Define constants
@@ -42,8 +42,8 @@
 # %%
 if __name__ == '__main__':
 
-    with open(CONFIG_FILE) as f:
-        params = yaml.load(f, Loader=bambird.get_loader())
+    # Load the configuration file    
+    params = cfg.load_config(CONFIG_FILE)
         
     # load the inital dataset with the metadata stored from XC
     df_dataset = pd.read_csv(XC_CSV_FILE, sep=';')
@@ -69,7 +69,6 @@
     # extract ROIS
     df_rois, csv_rois = bambird.multicpu_extract_rois(
                     dataset             =df_xc,
-                    fun                 =params['PARAMS_EXTRACT']['FUNC'],
                     params              =params['PARAMS_EXTRACT'],
                     save_path           =DIR_DATA / ROIS_NAME,
                     overwrite           =True,
diff --git a/bambird/__init__.py b/bambird/__init__.py
index 56843b7..ef99273 100644
--- a/bambird/__init__.py
+++ b/bambird/__init__.py
@@ -15,7 +15,7 @@
 -----------
 .. autosummary::
     :toctree: generated/
-    get_loader
+    load_config
 
 Dataset
 -------
@@ -39,7 +39,6 @@
 -------------
 .. autosummary::
     :toctree: generated/
-    
     compute_features
     multicpu_compute_features
     
@@ -47,8 +46,6 @@
 ---------------
 .. autosummary::
     :toctree: generated/
-    
-    prepare_features
     find_cluster
     cluster_eval
     overlay_rois
@@ -57,8 +54,20 @@
 
 """
 
+from .segmentation_extract_rois_full_sig import(       
+    extract_rois_full_sig,
+    )
+
+from .segmentation_extract_rois_core import(       
+    extract_rois_core,
+    )
+
+from .segmentation_extract_rois_in_soundscape import(       
+    extract_rois_in_soundscape,
+    )
+
 from .config import (
-    get_loader
+    load_config
     )
 
 from .dataset import(
@@ -73,13 +82,6 @@
     multicpu_extract_rois,
     )
 
-from .segmentation_extract_rois_full_sig import(       
-    extract_rois_full_sig,
-    )
-
-from .segmentation_extract_rois_core import(       
-    extract_rois_core,
-    )
 
 from .features import(
     compute_features,
@@ -87,7 +89,6 @@
     )
                  
 from .cluster import (
-    prepare_features,
     find_cluster,
     cluster_eval,
     overlay_rois,
@@ -98,7 +99,7 @@
 
 __all__ = [
         # config.py
-        'get_loader',
+        'load_config',
         # dataset.py
         'grab_audio_to_df',
         'change_path',
@@ -108,12 +109,12 @@
         'extract_rois_core',
         'extract_rois_full_sig',
         'single_file_extract_rois',
+        'extract_rois_in_soundscape',
         'multicpu_extract_rois',
         # features.py
         'compute_features',
         'multicpu_compute_features',
         # cluster.py
-        'prepare_features',
         'find_cluster',
         'cluster_eval',
         'overlay_rois',
diff --git a/bambird/cluster.py b/bambird/cluster.py
index a238d58..38ef0b4 100644
--- a/bambird/cluster.py
+++ b/bambird/cluster.py
@@ -49,6 +49,7 @@
 import maad
 
 from bambird import config as cfg
+# cfg.get_config()
 
 warnings.filterwarnings("ignore", module="librosa")
 warnings.filterwarnings("ignore", module="maad")
@@ -141,7 +142,7 @@ def _prepare_features(df_features,
 ###############################################################################
 def find_cluster(
         dataset,
-        params=cfg.DEFAULT_PARAMS_CLUSTER,
+        params=cfg.PARAMS['PARAMS_CLUSTER'],
         save_path=None,
         save_csv_filename=None,
         display=False,
@@ -509,12 +510,177 @@ def find_cluster(
 
     return df_cluster, csv_fullfilename
 
+
 ###############################################################################
 def cluster_eval(df_cluster,
                  path_to_csv_with_gt,
                  colname_label    = 'auto_label' ,
                  colname_label_gt = 'manual_label',
                  verbose=False):
+    """
+    Evaluate the clustering against ground truth annotations.
+
+    The annotations are read from a csv file (separator ';') indexed by
+    'filename_ts' and joined to the clusters. Each ROI is then marked as TP,
+    TN, FP or FN and the scores are computed for each category.
+
+    Parameters
+    ----------
+    df_cluster : pandas DataFrame
+        Result of the clustering. Must contain the column colname_label and
+        a column 'categories' (or 'species', used when 'categories' is
+        missing). The ROIs are identified by 'filename_ts' (column or index).
+    path_to_csv_with_gt : string or Path
+        Path to the csv file with the ground truth. Must contain the columns
+        'filename_ts' and colname_label_gt.
+    colname_label : string, optional
+        Column of df_cluster with the predicted label (0 or 1).
+        The default is 'auto_label'.
+    colname_label_gt : string, optional
+        Column of the csv file with the ground truth label (0 or 1).
+        The default is 'manual_label'.
+    verbose : boolean, optional
+        Print the scores. The default is False.
+
+    Returns
+    -------
+    df_scores : pandas DataFrame
+        Scores of each category (the index is named 'species').
+    p, r, f : floats
+        Macro-averaged precision, recall and F-score.
+    markers : pandas Series
+        Marker of each ROI ('TP', 'TN', 'FP', 'FN' or None).
+
+    Raises
+    ------
+    Exception
+        If the annotations cannot be loaded or joined to df_cluster.
+    """
+
+    df = df_cluster.copy()
+
+    # HACK to DELETE in the future. For compliance with the data of the
+    # article, in which the column 'categories' does not exist
+    if 'categories' not in df.columns:
+        df["categories"] = df["species"]
+
+    try:
+        # load all annotations
+        df_labels = pd.read_csv(path_to_csv_with_gt, sep=';')
+        # drop the columns that are not needed, if they exist
+        df_labels = df_labels.drop(columns=['species', 'code'], errors='ignore')
+        df_labels.set_index('filename_ts', inplace=True)
+        df_labels.loc[df_labels[colname_label_gt] == '0', colname_label_gt] = 0
+        df_labels.loc[df_labels[colname_label_gt] == '1', colname_label_gt] = 1
+
+        # join df_label and df then drop rows with NaN
+        if 'filename_ts' in df:
+            df.set_index('filename_ts', inplace=True)
+        df = df.join(df_labels[colname_label_gt])
+        df = df.dropna(axis=0)
+    except Exception as err:
+        raise Exception("WARNING: path_to_csv_with_gt must be a valid path to a csv ") from err
+
+    # Create a new column 'marker' with TP, TN, FP, FN
+    df['marker'] = None
+    pred, truth = df[colname_label], df[colname_label_gt]
+    df.loc[(pred == 1) & (truth == 1), 'marker'] = 'TP'
+    df.loc[(pred == 0) & (truth == 0), 'marker'] = 'TN'
+    df.loc[(pred == 1) & (truth == 0), 'marker'] = 'FP'
+    df.loc[(pred == 0) & (truth == 1), 'marker'] = 'FN'
+
+    # rows without marker are not taken into account in the confusion matrix
+    df_marked = df.dropna()
+
+    # compute the scores of each category
+    scores = []
+    for category in np.sort(df.categories.unique()):
+        df_cat = df[df["categories"] == category]
+        df_cat_marked = df_marked[df_marked["categories"] == category]
+
+        number_rois_initial = len(df_cat)
+        number_rois_final = np.sum(df_cat[colname_label] == 1)
+        fp_initial = np.sum(df_cat[colname_label_gt] == 0)
+        tp_initial = np.sum(df_cat[colname_label_gt] == 1)
+
+        # labels=[0, 1] : the matrix is 2x2 even if a single class is present
+        _tn, _fp, _fn, _tp = confusion_matrix(
+            df_cat_marked[colname_label_gt].to_list(),
+            df_cat_marked[colname_label].to_list(),
+            labels=[0, 1]).ravel()
+
+        # the scores are in %, and set to 0 when they are not defined
+        n_gt = tp_initial + fp_initial
+        precision_initial = round(tp_initial / n_gt * 100) if n_gt > 0 else 0
+        precision = round(_tp / (_tp + _fp) * 100) if (_tp + _fp) > 0 else 0
+        recall = round(_tp / (_tp + _fn) * 100) if (_tp + _fn) > 0 else 0
+
+        scores += [(category, number_rois_initial, number_rois_final,
+                    tp_initial, fp_initial, _tp, _fp, _tn, _fn,
+                    precision_initial, precision, recall)]
+
+        if verbose:
+            print(
+                "Initial number of ROIs is {} / Final number of ROIs is {} => {}% reduction / noise {}% => {}%  / recall {}% ({})".format(
+                    number_rois_initial,
+                    number_rois_final,
+                    round(100 - number_rois_final /
+                          number_rois_initial * 100, 1),
+                    100 - precision_initial,
+                    100 - precision,
+                    recall,
+                    category,
+                )
+            )
+
+    # dataframe with scores, with the species as index
+    df_scores = pd.DataFrame(scores,
+                             columns=['species',
+                                      'number_rois_initial',
+                                      'number_rois_final',
+                                      'tp_initial', 'fp_initial',
+                                      'tp', 'fp', 'tn', 'fn',
+                                      'precision_initial',
+                                      'precision',
+                                      'recall'])
+    df_scores.set_index('species', inplace = True)
+
+    # calculate the macro scores. They are returned, so they are computed
+    # whatever the value of verbose
+    y_true = (df.categories * df[colname_label_gt].apply(np.int64))
+    y_pred = (df.categories * df[colname_label].apply(np.int64))
+    p, r, f, _ = precision_recall_fscore_support(
+        y_true, y_pred, average='macro')
+
+    if verbose:
+        print("------------------------------------------------------")
+        print("------->Median initial noise {:.1f}%".format(
+            100-np.percentile(df_scores.precision_initial, 50)))
+        print("Lower outlier Initial noise  {:.1f}%".format(
+            100-np.percentile(df_scores.precision_initial, 95)))
+        print("Higher outlier Initial noise {:.1f}%".format(
+            100-np.percentile(df_scores.precision_initial, 5)))
+        print("------->  Median Final noise {:.1f}%".format(
+            100-np.percentile(df_scores.precision, 50)))
+        print("Lower outlier Final noise    {:.1f}%".format(
+            100-np.percentile(df_scores.precision, 95)))
+        print("Higher outlier Final noise   {:.1f}%".format(
+            100-np.percentile(df_scores.precision, 5)))
+        print("------------------------------------------------------")
+        print("******************************************************")
+        print("avg intial noise {:2.1f}% >>> avg final noise {:2.1f}%".format(
+            100-np.mean(df_scores.precision_initial), 100-np.mean(df_scores.precision)))
+        print("MACRO precision {:.2f} | recall {:.2f} | F {:.2f}".format(p,r,f))
+        print("******************************************************")
+
+    return df_scores, p, r, f, df.marker
+
+###############################################################################
+def cluster_eval_(df_cluster,
+                 path_to_csv_with_gt,
+                 colname_label    = 'auto_label' ,
+                 colname_label_gt = 'manual_label',
+                 verbose=False):
     """
 
     Evalation of the clustering (requires annotations or any other files to 
@@ -571,11 +737,11 @@ def cluster_eval(df_cluster,
         # load all annotations
         df_labels = pd.read_csv(path_to_csv_with_gt, sep=';')
         try :
-            df_labels.drop('categories', axis=1, inplace=True)
+            df_labels.drop('species', axis=1, inplace=True)
         except:
             pass
         try : 
-            df_labels.drop('categories', axis=1, inplace=True)
+            df_labels.drop('code', axis=1, inplace=True)
         except:
             pass
         df_labels.set_index('filename_ts', inplace=True)
@@ -713,7 +879,7 @@ def overlay_rois (cluster,
                                  'tab:purple','tab:pink','tab:brown','tab:olive',
                                  'tab:cyan','tab:gray','yellow'],
                   textbox_label=True,
-                  params=cfg.DEFAULT_PARAMS_EXTRACT,
+                  params=cfg.PARAMS['PARAMS_EXTRACT'],
                   filename=None,
                   random_seed=None,
                   verbose=False,
diff --git a/bambird/config.py b/bambird/config.py
index b062c07..c9801e4 100644
--- a/bambird/config.py
+++ b/bambird/config.py
@@ -12,20 +12,25 @@
 #%%
 # general packages
 import sys
+import os
 
 # basic packages
 import yaml
 
+import bambird
+
 #%%
 
 RANDOM_SEED = 1979  # Fix the random seed to be able to repeat the results
 
-DEFAULT_PARAMS_XC = {
-    'PARAM_XC_LIST': ['len:"20-180"', 'q:">C"'],
-    'NUM_FILES': 10
+PARAMS_XC = {
+    'PARAM_XC_LIST': ['len:"20-60"', 'q:">C"', 'type:"song"'],
+    'NUM_FILES': 20,
+    'CSV_XC_FILE': 'xc_metadata.csv'
 }
 
-DEFAULT_PARAMS_EXTRACT = {
+PARAMS_EXTRACT = {
+    "FUNC" : bambird.extract_rois_full_sig,
     # Extract Audio resampling
     "SAMPLE_RATE": 48000,  # Sampling frequency in Hz
     # Audio preprocess
@@ -34,10 +39,10 @@
     # butterworth filter order to select the bandwidth corresponding to the ROI
     "BUTTER_ORDER": 1,
     # Max duration of the audio files that we will use to compute the features
-    "AUDIO_DURATION": 60,
+    "AUDIO_DURATION": 30,
     # Split the audio signal of chunk with duration = SIGNAL LENGTH (in second)
-    "SIGNAL_LENGTH": 10,
-    "OVLP": 0.5,  # Define the overlap ratio between each chunk
+    "CHUNK_DURATION": 10,
+    "OVLP": 0,  # Define the overlap ratio between each chunk
     # Spectrogram
     # Mode to compute the remove_background ('mean', 'median')
     "MODE_RMBCKG": "median",
@@ -61,7 +66,7 @@
     "FILTER_ORDER": 5,
 }
 
-DEFAULT_PARAMS_FEATURES = {
+PARAMS_FEATURES = {
     # Extract Audio resampling
     "SAMPLE_RATE": 48000,  # Sampling frequency in Hz
     # Audio preprocess
@@ -75,14 +80,22 @@
     "SHAPE_RES": "high",
 }
 
-DEFAULT_PARAMS_CLUSTER = {
-    "PERCENTAGE_PTS": 5,       # in %
-    "METHOD": "DBSCAN",        # HDBSCAN or DBSCAN
-    "SCALER": "MINMAXSCALER", # STANDARDSCALER or ROBUSTSCALER or MINMAXSCALER
-    "KEEP":   "BIGGEST",         # ALL or BIGGEST
-    "EPS":    "auto"            # set the maximum distance between elements in a single clusters {a number or 'auto'}
+PARAMS_CLUSTER = {
+    "FEATURES": ['shp', 'centroid_f'],  # choose the features used to cluster {'shp', 'centroid_f', 'peak_f', 'duration_t', 'bandwidth_f', 'bandwidth_min_f', 'bandwidth_max_f', 'min_f', 'max_f' } 
+    "PERCENTAGE_PTS": 5,                 # minimum number of ROIs to form a cluster (in % of the total number of ROIs) {number between 0 and 1 or blank}
+    "MIN_PTS": None,                     # minimum number of ROIs to form a cluster {integer or blank}
+    "METHOD": "DBSCAN",                 # HDBSCAN or DBSCAN
+    "SCALER": "MINMAXSCALER",           # STANDARDSCALER or ROBUSTSCALER or MINMAXSCALER
+    "KEEP":   "BIGGEST",                # ALL or BIGGEST
+    "EPS":    "auto"                    # set the maximum distance between elements in a single clusters {a number or 'auto'}
 }
 
+PARAMS = {
+    'PARAMS_XC' : PARAMS_XC,
+    'PARAMS_EXTRACT' : PARAMS_EXTRACT,
+    'PARAMS_FEATURES' : PARAMS_FEATURES,
+    'PARAMS_CLUSTER' : PARAMS_CLUSTER
+    }
 
 #%%
 
@@ -113,14 +126,61 @@ def _fun_constructor(loader, node):
     print(val)
     return _fun_call_by_name(val)
 
+def _get_loader():
+    """Return a SafeLoader subclass (yaml.SafeLoader stays untouched) with !FUNC."""
+    loader = type("_BambirdLoader", (yaml.SafeLoader,), {})
+    loader.add_constructor("!FUNC", _fun_constructor)
+    return loader
+
 """ ===========================================================================
 
                     Public function 
 
 ============================================================================"""
 
-def get_loader():
-  """Add constructors to PyYAML loader."""
-  loader = yaml.SafeLoader
-  loader.add_constructor("!fun", _fun_constructor)
-  return loader
+def load_config(fullfilename = None):
+    """
+    Load the configuration file to set all the parameters of bambird
+
+    The module dictionaries are updated in place: they are used as default
+    arguments by the other modules, so rebinding them would have no effect.
+
+    Parameters
+    ----------
+    fullfilename : string, optional
+        Path to the configuration file.
+        if no valid configuration file is given, the current parameters are
+        returned unchanged (the default values, if nothing was loaded before).
+
+    Returns
+    -------
+    PARAMS : dictionary
+        Dictionary with all the parameters required by the bambird's functions
+    """
+    sections = ('PARAMS_XC', 'PARAMS_EXTRACT', 'PARAMS_FEATURES', 'PARAMS_CLUSTER')
+    if fullfilename is not None and os.path.isfile(str(fullfilename)):
+        with open(fullfilename) as f:
+            loaded = yaml.load(f, Loader=_get_loader()) or {}
+        # check the file before modifying the current parameters
+        missing = [name for name in sections if name not in loaded]
+        if missing:
+            raise KeyError("missing section(s) {} in {}".format(missing, fullfilename))
+        for name, value in loaded.items():
+            if name in sections:
+                PARAMS[name].clear()
+                PARAMS[name].update(value)
+            else:
+                PARAMS[name] = value
+    else :
+        print("The config file {} could not be loaded. Default parameters are loaded".format(fullfilename))
+    return PARAMS
+
+def get_config() :
+    """Return a new dictionary grouping the current parameter sections."""
+    return {
+        'PARAMS_XC' : PARAMS_XC,
+        'PARAMS_EXTRACT' : PARAMS_EXTRACT,
+        'PARAMS_FEATURES' : PARAMS_FEATURES,
+        'PARAMS_CLUSTER' : PARAMS_CLUSTER
+        }
+
+
diff --git a/bambird/dataset.py b/bambird/dataset.py
index bf27a21..7dc49d7 100644
--- a/bambird/dataset.py
+++ b/bambird/dataset.py
@@ -24,12 +24,13 @@
 
 #
 from bambird import config as cfg
+# cfg.get_config()
 
 #%%
 
 ###############################################################################
 def query_xc (species_list, 
-           params=cfg.DEFAULT_PARAMS_XC,
+           params=cfg.PARAMS['PARAMS_XC'],
            format_time=False,
            format_date=False,
            random_seed=cfg.RANDOM_SEED, 
diff --git a/bambird/features.py b/bambird/features.py
index 71f3d26..d99048a 100644
--- a/bambird/features.py
+++ b/bambird/features.py
@@ -33,6 +33,7 @@
 
 #
 from bambird import config as cfg
+# cfg.get_config()
 from bambird import grab_audio_to_df
 
 
@@ -41,7 +42,7 @@
 ###############################################################################
 def compute_features(
     audio_path,
-    params=cfg.DEFAULT_PARAMS_FEATURES,
+    params=cfg.PARAMS['PARAMS_FEATURES'],
     display=False,
     verbose=False):
     """ 
@@ -243,7 +244,7 @@ def compute_features(
 ###############################################################################
 def multicpu_compute_features(
                 dataset, 
-                params=cfg.DEFAULT_PARAMS_FEATURES,
+                params=cfg.PARAMS['PARAMS_FEATURES'],
                 save_path=None,
                 save_csv_filename=None,
                 nb_cpu=None,
diff --git a/bambird/segmentation.py b/bambird/segmentation.py
index 97ccdb4..cf5fbb9 100644
--- a/bambird/segmentation.py
+++ b/bambird/segmentation.py
@@ -166,7 +166,7 @@ def _save_rois(
 def single_file_extract_rois(
     audio_path,
     fun,
-    params=cfg.DEFAULT_PARAMS_EXTRACT,
+    params=cfg.PARAMS['PARAMS_EXTRACT'],
     save_path=None,
     display=False,
     verbose=False):
@@ -316,8 +316,7 @@ def single_file_extract_rois(
 ###############################################################################
 def multicpu_extract_rois(
     dataset, 
-    fun,
-    params=cfg.DEFAULT_PARAMS_EXTRACT,
+    params=cfg.PARAMS['PARAMS_EXTRACT'],
     save_path=None,
     save_csv_filename='rois.csv',
     overwrite=False,
@@ -337,8 +336,6 @@ def multicpu_extract_rois(
         "filename" and a column "fullfilename" with the full path to the audio
         files to process. This dataframe can be obtained by called the function
         grab_audio_to_df        
-    fun : function
-        name of the function that is called to segment the rois
     params : dictionnary, optioanl
         contains all the parameters to extract the rois 
     save_path : string, default is None
@@ -467,10 +464,15 @@ def multicpu_extract_rois(
                 nb_cpu = os.cpu_count()
                 
             # define a new function with fixed parameters to give to the multicpu pool 
-            #-------------------------------------------------------------------------
+            #-------------------------------------------------------------------------        
+            
+            # Print the characteristics of the function used to segment the files
+            if verbose :
+                print(params['FUNC'])
+            
             multicpu_func = partial(
                 single_file_extract_rois,
-                fun=fun,
+                fun=params['FUNC'],
                 params=params,
                 save_path=save_path,
                 display=False,
diff --git a/bambird/segmentation_extract_rois_core.py b/bambird/segmentation_extract_rois_core.py
index e8f26eb..5342dcf 100644
--- a/bambird/segmentation_extract_rois_core.py
+++ b/bambird/segmentation_extract_rois_core.py
@@ -162,7 +162,7 @@ def _merge_bbox(df_rois, margins):
 ###############################################################################
 def extract_rois_core(
     sig,
-    params=cfg.DEFAULT_PARAMS_EXTRACT,
+    params=cfg.PARAMS_EXTRACT,
     display=False,
     verbose=False,
     **kwargs):
diff --git a/bambird/segmentation_extract_rois_full_sig.py b/bambird/segmentation_extract_rois_full_sig.py
index 8fec607..f0205c1 100644
--- a/bambird/segmentation_extract_rois_full_sig.py
+++ b/bambird/segmentation_extract_rois_full_sig.py
@@ -24,8 +24,27 @@
 # Scikit-Maad (ecoacoustics functions) package
 import maad
 
-# import bamx
-from bambird import config as cfg
+PARAMS_EXTRACT = {'SAMPLE_RATE': 48000,   # Module-local default parameters,
+                 'LOW_FREQ': 250,         # used as the default `params` of
+                 'HIGH_FREQ': 12000,      # extract_rois_full_sig().
+                 'BUTTER_ORDER': 1,       # NOTE(review): hard-coded here because
+                 'AUDIO_DURATION': 30,    # this module no longer imports
+                 'CHUNK_DURATION': 10,    # bambird.config, so load_config()
+                 'OVLP': 0,               # does not change these values.
+                 'MODE_RMBCKG': 'median',
+                 'N_RUNNING_MEAN': 10,
+                 'NFFT': 1024,
+                 'MASK_PARAM1': 26,
+                 'MASK_PARAM2': 10,
+                 'MAX_RATIO_YX': 7,
+                 'MIN_DURATION': 0.1,     # presumably seconds - TODO confirm
+                 'MARGIN_T_LEFT': 0.2,
+                 'MARGIN_T_RIGHT': 0.2,
+                 'MARGIN_F_TOP': 250,     # presumably Hz - TODO confirm
+                 'MARGIN_F_BOTTOM': 250,
+                 'MARGIN_T': 0.1,
+                 'MARGIN_F': 250,
+                 'FILTER_ORDER': 5}
 
 #%%
 
@@ -235,7 +254,7 @@ def _select_rois(im_bin,
 ###############################################################################
 def extract_rois_full_sig(
     sig,
-    params=cfg.DEFAULT_PARAMS_EXTRACT,
+    params=PARAMS_EXTRACT,
     display=False,
     verbose=False,
     **kwargs):