From a288e399e6b0beec62729bd7d46b98a55de5ab79 Mon Sep 17 00:00:00 2001
From: XianBW <36835909+XianBW@users.noreply.github.com>
Date: Thu, 26 Sep 2024 09:08:25 +0800
Subject: [PATCH] feat: add kaggle tpl: feedback-prize (#331)

* change feedback tpl

* feedback tpl changes

* fix feedback tpl

* fix train.py of feedback tpl

* add rf model for feedback tpl

* fix CI
---
 .../fea_share_preprocess.py                   | 216 +++---------------
 .../feature/feature.py                        |  27 ++-
 .../model/model.py                            |  18 --
 .../model/model_randomforest.py               |  40 ++++
 .../model/model_xgboost.py                    |  43 ++++
 .../train.py                                  |  91 ++++++--
 .../meta_tpl/cross_validation_tpl.py          |  87 -------
 7 files changed, 211 insertions(+), 311 deletions(-)
 delete mode 100644 rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model.py
 create mode 100644 rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model_randomforest.py
 create mode 100644 rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model_xgboost.py
 delete mode 100644 rdagent/scenarios/kaggle/experiment/meta_tpl/cross_validation_tpl.py

diff --git a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/fea_share_preprocess.py
index 43b4beb4..6222a236 100644
--- a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/fea_share_preprocess.py
+++ b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/fea_share_preprocess.py
@@ -1,198 +1,48 @@
-# TODO: Fix
+import os
 import re
 
 import numpy as np  # linear algebra
 import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.model_selection import train_test_split
 
-train = pd.read_csv("/kaggle/input/train.csv")
-test = pd.read_csv("/kaggle/input/test.csv")
-submission = pd.read_csv("/kaggle/input/sample_submission.csv")
 
+def preprocess_script():
+    """
+    This method applies the preprocessing steps to the training, validation, and test datasets.
+    """
+    if os.path.exists("/kaggle/input/X_train.pkl"):
+        X_train = pd.read_pickle("/kaggle/input/X_train.pkl")
+        X_valid = pd.read_pickle("/kaggle/input/X_valid.pkl")
+        y_train = pd.read_pickle("/kaggle/input/y_train.pkl")
+        y_valid = pd.read_pickle("/kaggle/input/y_valid.pkl")
+        X_test = pd.read_pickle("/kaggle/input/X_test.pkl")
+        others = pd.read_pickle("/kaggle/input/others.pkl")
 
-features = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]
-target = train[features]
+        return X_train, X_valid, y_train, y_valid, X_test, *others
 
+    def data_cleaner(text):
+        text = text.strip()
+        text = re.sub(r"\n", "", text)
+        text = text.lower()
+        return text
 
-text_train = train["full_text"]
-text_test = test["full_text"]
+    # train
+    train = pd.read_csv("/kaggle/input/train.csv")
+    test = pd.read_csv("/kaggle/input/test.csv")
 
-text = pd.concat([text_train, text_test], ignore_index=True)
+    train["full_text"] = train["full_text"].apply(data_cleaner)
+    test["full_text"] = test["full_text"].apply(data_cleaner)
 
+    y_train = train[["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]]
 
-count_words = text.str.findall(r"(\w+)").str.len()
-print(count_words.sum())
+    vectorizer = TfidfVectorizer()
+    X_train = vectorizer.fit_transform(train["full_text"])
+    X_test = vectorizer.transform(test["full_text"])
 
+    X_train = pd.DataFrame.sparse.from_spmatrix(X_train)
+    X_test = pd.DataFrame.sparse.from_spmatrix(X_test)
 
-""" Cleaning Text """
-text = text.str.lower()
+    X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
 
-# removing special characters and numbers
-text = text.apply(lambda x: re.sub("[^a-z]\s", "", x))
-
-# remove hash tags
-text = text.str.replace("#", "")
-
-# remove words less than 3 character and greater than 7
-text = text.apply(lambda x: " ".join([w for w in x.split() if len(w) > 2 and len(w) < 8]))
-
-# removing stopwords
-# text = text.apply(lambda x : " ".join(word for word in x.split() if word not in stopwords ))
-
-count_words = text.str.findall(r"(\w+)").str.len()
-print(count_words.sum())
-
-
-most_freq_words = pd.Series(" ".join(text).lower().split()).value_counts()[:25]
-text = text.apply(lambda x: " ".join(word for word in x.split() if word not in most_freq_words))
-
-count_words = text.str.findall(r"(\w+)").str.len()
-
-apostrophe_dict = {
-    "ain't": "am not / are not",
-    "aren't": "are not / am not",
-    "can't": "cannot",
-    "can't've": "cannot have",
-    "'cause": "because",
-    "could've": "could have",
-    "couldn't": "could not",
-    "couldn't've": "could not have",
-    "didn't": "did not",
-    "doesn't": "does not",
-    "don't": "do not",
-    "hadn't": "had not",
-    "hadn't've": "had not have",
-    "hasn't": "has not",
-    "haven't": "have not",
-    "he'd": "he had / he would",
-    "he'd've": "he would have",
-    "he'll": "he shall / he will",
-    "he'll've": "he shall have / he will have",
-    "he's": "he has / he is",
-    "how'd": "how did",
-    "how'd'y": "how do you",
-    "how'll": "how will",
-    "how's": "how has / how is",
-    "i'd": "I had / I would",
-    "i'd've": "I would have",
-    "i'll": "I shall / I will",
-    "i'll've": "I shall have / I will have",
-    "i'm": "I am",
-    "i've": "I have",
-    "isn't": "is not",
-    "it'd": "it had / it would",
-    "it'd've": "it would have",
-    "it'll": "it shall / it will",
-    "it'll've": "it shall have / it will have",
-    "it's": "it has / it is",
-    "let's": "let us",
-    "ma'am": "madam",
-    "mayn't": "may not",
-    "might've": "might have",
-    "mightn't": "might not",
-    "mightn't've": "might not have",
-    "must've": "must have",
-    "mustn't": "must not",
-    "mustn't've": "must not have",
-    "needn't": "need not",
-    "needn't've": "need not have",
-    "o'clock": "of the clock",
-    "oughtn't": "ought not",
-    "oughtn't've": "ought not have",
-    "shan't": "shall not",
-    "sha'n't": "shall not",
-    "shan't've": "shall not have",
-    "she'd": "she had / she would",
-    "she'd've": "she would have",
-    "she'll": "she shall / she will",
-    "she'll've": "she shall have / she will have",
-    "she's": "she has / she is",
-    "should've": "should have",
-    "shouldn't": "should not",
-    "shouldn't've": "should not have",
-    "so've": "so have",
-    "so's": "so as / so is",
-    "that'd": "that would / that had",
-    "that'd've": "that would have",
-    "that's": "that has / that is",
-    "there'd": "there had / there would",
-    "there'd've": "there would have",
-    "there's": "there has / there is",
-    "they'd": "they had / they would",
-    "they'd've": "they would have",
-    "they'll": "they shall / they will",
-    "they'll've": "they shall have / they will have",
-    "they're": "they are",
-    "they've": "they have",
-    "to've": "to have",
-    "wasn't": "was not",
-    "we'd": "we had / we would",
-    "we'd've": "we would have",
-    "we'll": "we will",
-    "we'll've": "we will have",
-    "we're": "we are",
-    "we've": "we have",
-    "weren't": "were not",
-    "what'll": "what shall / what will",
-    "what'll've": "what shall have / what will have",
-    "what're": "what are",
-    "what's": "what has / what is",
-    "what've": "what have",
-    "when's": "when has / when is",
-    "when've": "when have",
-    "where'd": "where did",
-    "where's": "where has / where is",
-    "where've": "where have",
-    "who'll": "who shall / who will",
-    "who'll've": "who shall have / who will have",
-    "who's": "who has / who is",
-    "who've": "who have",
-    "why's": "why has / why is",
-    "why've": "why have",
-    "will've": "will have",
-    "won't": "will not",
-    "won't've": "will not have",
-    "would've": "would have",
-    "wouldn't": "would not",
-    "wouldn't've": "would not have",
-    "y'all": "you all",
-    "y'all'd": "you all would",
-    "y'all'd've": "you all would have",
-    "y'all're": "you all are",
-    "y'all've": "you all have",
-    "you'd": "you had / you would",
-    "you'd've": "you would have",
-    "you'll": "you shall / you will",
-    "you'll've": "you shall have / you will have",
-    "you're": "you are",
-    "you've": "you have",
-}
-
-
-def lookup_dict(txt, dictionary):
-    for word in txt.split():
-        if word.lower() in dictionary:
-            if word.lower() in txt.split():
-                txt = txt.replace(word, dictionary[word.lower()])
-    return txt
-
-
-text = text.apply(lambda x: lookup_dict(x, apostrophe_dict))
-
-# Remove rare words
-from collections import Counter
-from itertools import chain
-
-# split words into lists
-v = text.str.split().tolist()
-# compute global word frequency
-c = Counter(chain.from_iterable(v))
-# filter, join, and re-assign
-text = [" ".join([j for j in i if c[j] > 1]) for i in v]
-text = pd.Series(text)
-
-total_word = 0
-for x, word in enumerate(text):
-    num_word = len(word.split())
-    # print(num_word)
-    total_word = total_word + num_word
-print(total_word)
+    return X_train, X_valid, y_train, y_valid, X_test
diff --git a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/feature/feature.py b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/feature/feature.py
index e43c6fc3..8ae043ac 100644
--- a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/feature/feature.py
+++ b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/feature/feature.py
@@ -1,16 +1,23 @@
-import numpy as np
 import pandas as pd
-from sklearn.feature_extraction.text import TfidfVectorizer
 
+"""
+Here is the feature engineering code for each task, with a class that has a fit and transform method.
+Remember to expose the class as `feature_engineering_cls` at the end of the file.
+"""
 
-class TfidfFeature:
+
+class IdentityFeature:
     def fit(self, train_df: pd.DataFrame):
-        train_df = np.array(train_df).tolist()
-        train_X = list(map("".join, train_df))
-        self.model = TfidfVectorizer(stop_words="english", max_df=0.5, min_df=0.01).fit(train_X)
-        # print(self.model.get_feature_names_out()[:5])
+        """
+        Fit the feature engineering model to the training data.
+        """
+        pass
 
     def transform(self, X: pd.DataFrame):
-        X = np.array(X).tolist()
-        X = list(map("".join, X))
-        return self.model.transform(X)
+        """
+        Transform the input data.
+        """
+        return X
+
+
+feature_engineering_cls = IdentityFeature
diff --git a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model.py b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model.py
deleted file mode 100644
index f0d15b3c..00000000
--- a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model.py
+++ /dev/null
@@ -1,18 +0,0 @@
-import pandas as pd
-from sklearn.multioutput import MultiOutputRegressor
-from sklearn.svm import SVR
-
-
-def select(X: pd.DataFrame) -> pd.DataFrame:
-    return X
-
-
-def fit(X_train: pd.DataFrame, y_train: pd.Series):
-    model = MultiOutputRegressor(SVR())
-    model.fit(X_train, y_train)
-    return model
-
-
-def predict(model: MultiOutputRegressor, X_test: pd.DataFrame):
-    X_test_selected = select(X_test)
-    return model.predict(X_test_selected)
diff --git a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model_randomforest.py b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model_randomforest.py
new file mode 100644
index 00000000..0adc37d6
--- /dev/null
+++ b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model_randomforest.py
@@ -0,0 +1,40 @@
+import numpy as np
+import pandas as pd
+from sklearn.ensemble import RandomForestRegressor
+
+
+def select(X: pd.DataFrame) -> pd.DataFrame:
+    """
+    Select relevant features. To be used in fit & predict function.
+    """
+    # For now, we assume all features are relevant. This can be expanded to feature selection logic.
+    return X
+
+
+def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
+    """
+    Define and train the Random Forest model. Merge feature selection into the pipeline.
+    """
+    # Initialize the Random Forest model
+    model = RandomForestRegressor(n_estimators=100, random_state=32, n_jobs=-1)
+
+    # Select features (if any feature selection is needed)
+    X_train_selected = select(X_train)
+
+    # Fit the model
+    model.fit(X_train_selected, y_train)
+
+    return model
+
+
+def predict(model, X):
+    """
+    Keep feature selection's consistency and make predictions.
+    """
+    # Select features (if any feature selection is needed)
+    X_selected = select(X)
+
+    # Predict using the trained model
+    y_pred = model.predict(X_selected)
+
+    return y_pred
diff --git a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model_xgboost.py b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model_xgboost.py
new file mode 100644
index 00000000..07a30dc5
--- /dev/null
+++ b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model_xgboost.py
@@ -0,0 +1,43 @@
+"""
+motivation  of the model
+"""
+
+import pandas as pd
+import xgboost as xgb
+from sklearn.multioutput import MultiOutputRegressor
+
+
+def select(X: pd.DataFrame) -> pd.DataFrame:
+    # Ignore feature selection logic
+    return X
+
+
+def is_sparse_df(df: pd.DataFrame) -> bool:
+    # True only if every column is sparse; `DataFrame.sparse` (used by fit/predict) raises on mixed frames.
+    return len(df.columns) > 0 and all(isinstance(dtype, pd.SparseDtype) for dtype in df.dtypes)
+
+
+def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame):
+    """Define and train the model. Merge feature_select"""
+    X_train = select(X_train)
+
+    xgb_estimator = xgb.XGBRegressor(n_estimators=500, random_state=0, objective="reg:squarederror")
+
+    model = MultiOutputRegressor(xgb_estimator, n_jobs=2)
+
+    if is_sparse_df(X_train):
+        X_train = X_train.sparse.to_coo()
+
+    model.fit(X_train, y_train)
+    return model
+
+
+def predict(model, X_test):
+    """
+    Keep feature select's consistency.
+    """
+    X_test = select(X_test)
+    if is_sparse_df(X_test):
+        X_test = X_test.sparse.to_coo()
+    y_pred = model.predict(X_test)
+    return y_pred
diff --git a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/train.py b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/train.py
index 29d957cb..cd0e88bf 100644
--- a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/train.py
+++ b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/train.py
@@ -1,8 +1,12 @@
-# TODO: fix the train.py
-
 import importlib.util
 from pathlib import Path
 
+import numpy as np
+import pandas as pd
+from fea_share_preprocess import preprocess_script
+
+DIRNAME = Path(__file__).absolute().resolve().parent
+
 
 def import_module_from_path(module_name, module_path):
     spec = importlib.util.spec_from_file_location(module_name, module_path)
@@ -11,27 +15,88 @@ def import_module_from_path(module_name, module_path):
     return module
 
 
-DIRNAME = Path(__file__).absolute().resolve().parent
+# 1) Preprocess the data
+X_train, X_valid, y_train, y_valid, X_test = preprocess_script()
 
-y = target
-X = text[: len(train)]
-X_test = text[len(train) :]
+# 2) Auto feature engineering
+X_train_l, X_valid_l = [], []
+X_test_l = []
 
 for f in DIRNAME.glob("feature/feat*.py"):
     cls = import_module_from_path(f.stem, f).feature_engineering_cls()
     cls.fit(X_train)
     X_train_f = cls.transform(X_train)
+    X_valid_f = cls.transform(X_valid)
     X_test_f = cls.transform(X_test)
 
     X_train_l.append(X_train_f)
+    X_valid_l.append(X_valid_f)
     X_test_l.append(X_test_f)
 
+X_train = pd.concat(X_train_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_train_l))])
+X_valid = pd.concat(X_valid_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_valid_l))])
+X_test = pd.concat(X_test_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_test_l))])
+
+
+# 3) Train the model
+def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Flatten MultiIndex columns in place by joining the levels of each column key with "_",
+    e.g. (feature_0, a), (feature_0, b) -> feature_0_a, feature_0_b. Single-level frames are returned as-is.
+    """
+    if df.columns.nlevels == 1:
+        return df
+    df.columns = ["_".join(str(level) for level in col).strip() for col in df.columns.values]
+    return df
+
+
+X_train = flatten_columns(X_train)
+X_valid = flatten_columns(X_valid)
+X_test = flatten_columns(X_test)
+
+
+model_l = []  # list[tuple[model, predict_func]]
+for f in DIRNAME.glob("model/model*.py"):
+    m = import_module_from_path(f.stem, f)
+    model_l.append((m.fit(X_train, y_train, X_valid, y_valid), m.predict))
+
+# 4) Evaluate the model on the validation set
+y_valid_pred_l = []
+for model, predict_func in model_l:
+    y_valid_pred = predict_func(model, X_valid)
+    y_valid_pred_l.append(y_valid_pred)
+    # print(y_valid_pred)
+    # print(y_valid_pred.shape)
+
+# 5) Ensemble
+# Average the per-model predictions (regression targets, so a mean rather than a vote)
+y_valid_pred_ensemble = np.mean(y_valid_pred_l, axis=0)
+
+
+# 6) Save the validation metrics
+def MCRMSE(y_true, y_pred):
+    return np.mean(np.sqrt(np.mean((y_true - y_pred) ** 2, axis=0)))
+
+
+metrics = MCRMSE(y_valid, y_valid_pred_ensemble)
+print(f"MCRMSE on valid set: {metrics}")
+pd.Series(data=[metrics], index=["MCRMSE"]).to_csv("submission_score.csv")
+
+# 7) Make predictions on the test set and save them
+y_test_pred_l = []
+for model, predict_func in model_l:
+    y_test_pred_l.append(predict_func(model, X_test))
+
+# Regression task: average the per-model test predictions
+y_test_pred = np.mean(y_test_pred_l, axis=0)
+
 
-submission["cohesion"] = predictions[:, 0]
-submission["syntax"] = predictions[:, 1]
-submission["vocabulary"] = predictions[:, 2]
-submission["phraseology"] = predictions[:, 3]
-submission["grammar"] = predictions[:, 4]
-submission["conventions"] = predictions[:, 5]
+submission_result = pd.read_csv("/kaggle/input/sample_submission.csv")
+submission_result["cohesion"] = y_test_pred[:, 0]
+submission_result["syntax"] = y_test_pred[:, 1]
+submission_result["vocabulary"] = y_test_pred[:, 2]
+submission_result["phraseology"] = y_test_pred[:, 3]
+submission_result["grammar"] = y_test_pred[:, 4]
+submission_result["conventions"] = y_test_pred[:, 5]
 
-submission.to_csv("submission.csv", index=False)  # writing data to a CSV file
+submission_result.to_csv("submission.csv", index=False)
diff --git a/rdagent/scenarios/kaggle/experiment/meta_tpl/cross_validation_tpl.py b/rdagent/scenarios/kaggle/experiment/meta_tpl/cross_validation_tpl.py
deleted file mode 100644
index 90ec0c2a..00000000
--- a/rdagent/scenarios/kaggle/experiment/meta_tpl/cross_validation_tpl.py
+++ /dev/null
@@ -1,87 +0,0 @@
-from pathlib import Path
-
-import numpy as np
-import pandas as pd
-from sklearn.model_selection import KFold
-from sklearn.preprocessing import LabelEncoder
-
-from rdagent.scenarios.kaggle.experiment.meta_tpl.fea_share_preprocess import preprocess
-
-
-def compute_metrics_for_classification(y_true, y_pred):
-    """Compute MCC for classification."""
-    from sklearn.metrics import matthews_corrcoef
-
-    return matthews_corrcoef(y_true, y_pred)
-
-
-def perform_kfold_cross_validation(X, y, n_splits=2, random_seed=42):
-    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_seed)
-    fold_metrics = []
-
-    DIRNAME = Path(__file__).absolute().resolve().parent
-
-    for fold, (train_idx, valid_idx) in enumerate(kf.split(X)):
-        X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
-        y_train_fold, y_valid_fold = y[train_idx], y[valid_idx]
-
-        # TODO: Preprocess and Feature Engineering before K-Fold CV
-
-        # Preprocess the data
-        X_train_fold = preprocess(X_train_fold)
-        X_valid_fold = preprocess(X_valid_fold)
-
-        # Feature Engineering
-        X_train_l_fold, X_valid_l_fold = [], []
-        for f in DIRNAME.glob("feat*.py"):
-            m = __import__(f.name.strip(".py"))
-            X_train_fold = m.feat_eng(X_train_fold)
-            X_valid_fold = m.feat_eng(X_valid_fold)
-
-            X_train_l_fold.append(X_train_fold)
-            X_valid_l_fold.append(X_valid_fold)
-
-        X_train_fold = pd.concat(X_train_l_fold, axis=1)
-        X_valid_fold = pd.concat(X_valid_l_fold, axis=1)
-
-        # Align features
-        X_valid_fold = X_valid_fold.reindex(columns=X_train_fold.columns, fill_value=0)
-
-        # Train and evaluate models
-        mcc_scores = []
-        model_l = []  # Reinitialize model list
-        for f in DIRNAME.glob("model*.py"):
-            m = __import__(f.name.strip(".py"))
-            model = m.fit(X_train_fold, y_train_fold, X_valid_fold, y_valid_fold)
-            y_valid_pred = m.predict(model, X_valid_fold)
-            mcc = compute_metrics_for_classification(y_valid_fold, y_valid_pred)
-            mcc_scores.append(mcc)
-            print(f"Fold {fold+1}, Model {f.name}: MCC = {mcc}")
-
-        # Store the average MCC score for this fold
-        avg_mcc = np.mean(mcc_scores)
-        fold_metrics.append(avg_mcc)
-        print(f"Fold {fold+1} average MCC: {avg_mcc}")
-
-    # Calculate the overall average MCC
-    overall_avg_mcc = np.mean(fold_metrics)
-    result_df = pd.DataFrame({"Overall Average MCC": [overall_avg_mcc]})
-    result_df.to_csv(f"path/to/playground-series-s4e8/cv_score_{f.name.strip('.py')}.csv", index=False)
-
-    print(f"Overall Average MCC across all folds: {overall_avg_mcc}")
-    return overall_avg_mcc
-
-
-# This allows the script to be run directly
-if __name__ == "__main__":
-    # Load and preprocess the data
-    data_df = pd.read_csv("path/to/playground-series-s4e8/train.csv")
-    data_df = data_df.drop(["id"], axis=1)
-
-    X = data_df.drop(["class"], axis=1)
-    y = data_df[["class"]]
-
-    label_encoder = LabelEncoder()
-    # transfrom y to 1D
-    y = label_encoder.fit_transform(y)
-    result = perform_kfold_cross_validation(X, y)