From a288e399e6b0beec62729bd7d46b98a55de5ab79 Mon Sep 17 00:00:00 2001 From: XianBW <36835909+XianBW@users.noreply.github.com> Date: Thu, 26 Sep 2024 09:08:25 +0800 Subject: [PATCH] feat: add kaggle tpl: feedback-prize (#331) * change feedback tpl * feedback tpl changes * fix feedback tpl * fix train.py of feedback tpl * add rf model for feedback tpl * fix CI --- .../fea_share_preprocess.py | 216 +++--------------- .../feature/feature.py | 27 ++- .../model/model.py | 18 -- .../model/model_randomforest.py | 40 ++++ .../model/model_xgboost.py | 43 ++++ .../train.py | 91 ++++++-- .../meta_tpl/cross_validation_tpl.py | 87 ------- 7 files changed, 211 insertions(+), 311 deletions(-) delete mode 100644 rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model.py create mode 100644 rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model_randomforest.py create mode 100644 rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model_xgboost.py delete mode 100644 rdagent/scenarios/kaggle/experiment/meta_tpl/cross_validation_tpl.py diff --git a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/fea_share_preprocess.py index 43b4beb4..6222a236 100644 --- a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/fea_share_preprocess.py +++ b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/fea_share_preprocess.py @@ -1,198 +1,48 @@ -# TODO: Fix +import os import re import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.model_selection import train_test_split -train = pd.read_csv("/kaggle/input/train.csv") -test = pd.read_csv("/kaggle/input/test.csv") -submission = pd.read_csv("/kaggle/input/sample_submission.csv") +def preprocess_script(): + """ + This method applies the preprocessing steps to the training, validation, and test datasets. + """ + if os.path.exists("/kaggle/input/X_train.pkl"): + X_train = pd.read_pickle("/kaggle/input/X_train.pkl") + X_valid = pd.read_pickle("/kaggle/input/X_valid.pkl") + y_train = pd.read_pickle("/kaggle/input/y_train.pkl") + y_valid = pd.read_pickle("/kaggle/input/y_valid.pkl") + X_test = pd.read_pickle("/kaggle/input/X_test.pkl") + others = pd.read_pickle("/kaggle/input/others.pkl") -features = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"] -target = train[features] + return X_train, X_valid, y_train, y_valid, X_test, *others + def data_cleaner(text): + text = text.strip() + text = re.sub(r"\n", "", text) + text = text.lower() + return text -text_train = train["full_text"] -text_test = test["full_text"] + # train + train = pd.read_csv("/kaggle/input/train.csv") + test = pd.read_csv("/kaggle/input/test.csv") -text = pd.concat([text_train, text_test], ignore_index=True) + train["full_text"] = train["full_text"].apply(data_cleaner) + test["full_text"] = test["full_text"].apply(data_cleaner) + y_train = train[["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]] -count_words = text.str.findall(r"(\w+)").str.len() -print(count_words.sum()) + vectorizer = TfidfVectorizer() + X_train = vectorizer.fit_transform(train["full_text"]) + X_test = vectorizer.transform(test["full_text"]) + X_train = pd.DataFrame.sparse.from_spmatrix(X_train) + X_test = pd.DataFrame.sparse.from_spmatrix(X_test) -""" Cleaning Text """ -text = text.str.lower() + X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42) -# removing special characters and numbers -text = text.apply(lambda x: re.sub("[^a-z]\s", "", x)) - -# remove hash tags -text = text.str.replace("#", "") - -# remove words less than 3 character and greater than 7 -text = text.apply(lambda x: " ".join([w for w in x.split() if len(w) > 2 and len(w) < 8])) - -# removing stopwords -# text = text.apply(lambda x : " ".join(word for word in x.split() if word not in stopwords )) - -count_words = text.str.findall(r"(\w+)").str.len() -print(count_words.sum()) - - -most_freq_words = pd.Series(" ".join(text).lower().split()).value_counts()[:25] -text = text.apply(lambda x: " ".join(word for word in x.split() if word not in most_freq_words)) - -count_words = text.str.findall(r"(\w+)").str.len() - -apostrophe_dict = { - "ain't": "am not / are not", - "aren't": "are not / am not", - "can't": "cannot", - "can't've": "cannot have", - "'cause": "because", - "could've": "could have", - "couldn't": "could not", - "couldn't've": "could not have", - "didn't": "did not", - "doesn't": "does not", - "don't": "do not", - "hadn't": "had not", - "hadn't've": "had not have", - "hasn't": "has not", - "haven't": "have not", - "he'd": "he had / he would", - "he'd've": "he would have", - "he'll": "he shall / he will", - "he'll've": "he shall have / he will have", - "he's": "he has / he is", - "how'd": "how did", - "how'd'y": "how do you", - "how'll": "how will", - "how's": "how has / how is", - "i'd": "I had / I would", - "i'd've": "I would have", - "i'll": "I shall / I will", - "i'll've": "I shall have / I will have", - "i'm": "I am", - "i've": "I have", - "isn't": "is not", - "it'd": "it had / it would", - "it'd've": "it would have", - "it'll": "it shall / it will", - "it'll've": "it shall have / it will have", - "it's": "it has / it is", - "let's": "let us", - "ma'am": "madam", - "mayn't": "may not", - "might've": "might have", - "mightn't": "might not", - "mightn't've": "might not have", - "must've": "must have", - "mustn't": "must not", - "mustn't've": "must not have", - "needn't": "need not", - "needn't've": "need not have", - "o'clock": "of the clock", - "oughtn't": "ought not", - "oughtn't've": "ought not have", - "shan't": "shall not", - "sha'n't": "shall not", - "shan't've": "shall not have", - "she'd": "she had / she would", - "she'd've": "she would have", - "she'll": "she shall / she will", - "she'll've": "she shall have / she will have", - "she's": "she has / she is", - "should've": "should have", - "shouldn't": "should not", - "shouldn't've": "should not have", - "so've": "so have", - "so's": "so as / so is", - "that'd": "that would / that had", - "that'd've": "that would have", - "that's": "that has / that is", - "there'd": "there had / there would", - "there'd've": "there would have", - "there's": "there has / there is", - "they'd": "they had / they would", - "they'd've": "they would have", - "they'll": "they shall / they will", - "they'll've": "they shall have / they will have", - "they're": "they are", - "they've": "they have", - "to've": "to have", - "wasn't": "was not", - "we'd": "we had / we would", - "we'd've": "we would have", - "we'll": "we will", - "we'll've": "we will have", - "we're": "we are", - "we've": "we have", - "weren't": "were not", - "what'll": "what shall / what will", - "what'll've": "what shall have / what will have", - "what're": "what are", - "what's": "what has / what is", - "what've": "what have", - "when's": "when has / when is", - "when've": "when have", - "where'd": "where did", - "where's": "where has / where is", - "where've": "where have", - "who'll": "who shall / who will", - "who'll've": "who shall have / who will have", - "who's": "who has / who is", - "who've": "who have", - "why's": "why has / why is", - "why've": "why have", - "will've": "will have", - "won't": "will not", - "won't've": "will not have", - "would've": "would have", - "wouldn't": "would not", - "wouldn't've": "would not have", - "y'all": "you all", - "y'all'd": "you all would", - "y'all'd've": "you all would have", - "y'all're": "you all are", - "y'all've": "you all have", - "you'd": "you had / you would", - "you'd've": "you would have", - "you'll": "you shall / you will", - "you'll've": "you shall have / you will have", - "you're": "you are", - "you've": "you have", -} - - -def lookup_dict(txt, dictionary): - for word in txt.split(): - if word.lower() in dictionary: - if word.lower() in txt.split(): - txt = txt.replace(word, dictionary[word.lower()]) - return txt - - -text = text.apply(lambda x: lookup_dict(x, apostrophe_dict)) - -# Remove rare words -from collections import Counter -from itertools import chain - -# split words into lists -v = text.str.split().tolist() -# compute global word frequency -c = Counter(chain.from_iterable(v)) -# filter, join, and re-assign -text = [" ".join([j for j in i if c[j] > 1]) for i in v] -text = pd.Series(text) - -total_word = 0 -for x, word in enumerate(text): - num_word = len(word.split()) - # print(num_word) - total_word = total_word + num_word -print(total_word) + return X_train, X_valid, y_train, y_valid, X_test diff --git a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/feature/feature.py b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/feature/feature.py index e43c6fc3..8ae043ac 100644 --- a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/feature/feature.py +++ b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/feature/feature.py @@ -1,16 +1,23 @@ -import numpy as np import pandas as pd -from sklearn.feature_extraction.text import TfidfVectorizer +""" +Here is the feature engineering code for each task, with a class that has a fit and transform method. +Remember +""" -class TfidfFeature: + +class IdentityFeature: def fit(self, train_df: pd.DataFrame): - train_df = np.array(train_df).tolist() - train_X = list(map("".join, train_df)) - self.model = TfidfVectorizer(stop_words="english", max_df=0.5, min_df=0.01).fit(train_X) - # print(self.model.get_feature_names_out()[:5]) + """ + Fit the feature engineering model to the training data. + """ + pass def transform(self, X: pd.DataFrame): - X = np.array(X).tolist() - X = list(map("".join, X)) - return self.model.transform(X) + """ + Transform the input data. + """ + return X + + +feature_engineering_cls = IdentityFeature diff --git a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model.py b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model.py deleted file mode 100644 index f0d15b3c..00000000 --- a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model.py +++ /dev/null @@ -1,18 +0,0 @@ -import pandas as pd -from sklearn.multioutput import MultiOutputRegressor -from sklearn.svm import SVR - - -def select(X: pd.DataFrame) -> pd.DataFrame: - return X - - -def fit(X_train: pd.DataFrame, y_train: pd.Series): - model = MultiOutputRegressor(SVR()) - model.fit(X_train, y_train) - return model - - -def predict(model: MultiOutputRegressor, X_test: pd.DataFrame): - X_test_selected = select(X_test) - return model.predict(X_test_selected) diff --git a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model_randomforest.py b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model_randomforest.py new file mode 100644 index 00000000..0adc37d6 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model_randomforest.py @@ -0,0 +1,40 @@ +import numpy as np +import pandas as pd +from sklearn.ensemble import RandomForestRegressor + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + return X + + +def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series): + """ + Define and train the Random Forest model. Merge feature selection into the pipeline. + """ + # Initialize the Random Forest model + model = RandomForestRegressor(n_estimators=100, random_state=32, n_jobs=-1) + + # Select features (if any feature selection is needed) + X_train_selected = select(X_train) + + # Fit the model + model.fit(X_train_selected, y_train) + + return model + + +def predict(model, X): + """ + Keep feature selection's consistency and make predictions. + """ + # Select features (if any feature selection is needed) + X_selected = select(X) + + # Predict using the trained model + y_pred = model.predict(X_selected) + + return y_pred diff --git a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model_xgboost.py b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model_xgboost.py new file mode 100644 index 00000000..07a30dc5 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model_xgboost.py @@ -0,0 +1,43 @@ +""" +motivation of the model +""" + +import pandas as pd +import xgboost as xgb +from sklearn.multioutput import MultiOutputRegressor + + +def select(X: pd.DataFrame) -> pd.DataFrame: + # Ignore feature selection logic + return X + + +def is_sparse_df(df: pd.DataFrame) -> bool: + # 检查 DataFrame 中的每一列是否为稀疏类型 + return any(isinstance(dtype, pd.SparseDtype) for dtype in df.dtypes) + + +def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame): + """Define and train the model. Merge feature_select""" + X_train = select(X_train) + + xgb_estimator = xgb.XGBRegressor(n_estimators=500, random_state=0, objective="reg:squarederror") + + model = MultiOutputRegressor(xgb_estimator, n_jobs=2) + + if is_sparse_df(X_train): + X_train = X_train.sparse.to_coo() + + model.fit(X_train, y_train) + return model + + +def predict(model, X_test): + """ + Keep feature select's consistency. + """ + X_test = select(X_test) + if is_sparse_df(X_test): + X_test = X_test.sparse.to_coo() + y_pred = model.predict(X_test) + return y_pred diff --git a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/train.py b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/train.py index 29d957cb..cd0e88bf 100644 --- a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/train.py +++ b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/train.py @@ -1,8 +1,12 @@ -# TODO: fix the train.py - import importlib.util from pathlib import Path +import numpy as np +import pandas as pd +from fea_share_preprocess import preprocess_script + +DIRNAME = Path(__file__).absolute().resolve().parent + def import_module_from_path(module_name, module_path): spec = importlib.util.spec_from_file_location(module_name, module_path) @@ -11,27 +15,88 @@ def import_module_from_path(module_name, module_path): return module -DIRNAME = Path(__file__).absolute().resolve().parent +# 1) Preprocess the data +X_train, X_valid, y_train, y_valid, X_test = preprocess_script() -y = target -X = text[: len(train)] -X_test = text[len(train) :] +# 2) Auto feature engineering +X_train_l, X_valid_l = [], [] +X_test_l = [] for f in DIRNAME.glob("feature/feat*.py"): cls = import_module_from_path(f.stem, f).feature_engineering_cls() cls.fit(X_train) X_train_f = cls.transform(X_train) + X_valid_f = cls.transform(X_valid) X_test_f = cls.transform(X_test) X_train_l.append(X_train_f) + X_valid_l.append(X_valid_f) X_test_l.append(X_test_f) +X_train = pd.concat(X_train_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_train_l))]) +X_valid = pd.concat(X_valid_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_valid_l))]) +X_test = pd.concat(X_test_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_test_l))]) + + +# 3) Train the model +def flatten_columns(df: pd.DataFrame) -> pd.DataFrame: + """ + Flatten the columns of a DataFrame with MultiIndex columns, + for (feature_0, a), (feature_0, b) -> feature_0_a, feature_0_b + """ + if df.columns.nlevels == 1: + return df + df.columns = ["_".join(str(col)).strip() for col in df.columns.values] + return df + + +X_train = flatten_columns(X_train) +X_valid = flatten_columns(X_valid) +X_test = flatten_columns(X_test) + + +model_l = [] # list[tuple[model, predict_func]] +for f in DIRNAME.glob("model/model*.py"): + m = import_module_from_path(f.stem, f) + model_l.append((m.fit(X_train, y_train, X_valid, y_valid), m.predict)) + +# 4) Evaluate the model on the validation set +y_valid_pred_l = [] +for model, predict_func in model_l: + y_valid_pred = predict_func(model, X_valid) + y_valid_pred_l.append(y_valid_pred) + # print(y_valid_pred) + # print(y_valid_pred.shape) + +# 5) Ensemble +# Majority vote ensemble +y_valid_pred_ensemble = np.mean(y_valid_pred_l, axis=0) + + +# 6) Save the validation metrics +def MCRMSE(y_true, y_pred): + return np.mean(np.sqrt(np.mean((y_true - y_pred) ** 2, axis=0))) + + +metrics = MCRMSE(y_valid, y_valid_pred_ensemble) +print(f"MCRMSE on valid set: {metrics}") +pd.Series(data=[metrics], index=["MCRMSE"]).to_csv("submission_score.csv") + +# 7) Make predictions on the test set and save them +y_test_pred_l = [] +for model, predict_func in model_l: + y_test_pred_l.append(predict_func(model, X_test)) + +# For multiclass classification, use the mode of the predictions +y_test_pred = np.mean(y_test_pred_l, axis=0) + -submission["cohesion"] = predictions[:, 0] -submission["syntax"] = predictions[:, 1] -submission["vocabulary"] = predictions[:, 2] -submission["phraseology"] = predictions[:, 3] -submission["grammar"] = predictions[:, 4] -submission["conventions"] = predictions[:, 5] +submission_result = pd.read_csv("/kaggle/input/sample_submission.csv") +submission_result["cohesion"] = y_test_pred[:, 0] +submission_result["syntax"] = y_test_pred[:, 1] +submission_result["vocabulary"] = y_test_pred[:, 2] +submission_result["phraseology"] = y_test_pred[:, 3] +submission_result["grammar"] = y_test_pred[:, 4] +submission_result["conventions"] = y_test_pred[:, 5] -submission.to_csv("submission.csv", index=False) # writing data to a CSV file +submission_result.to_csv("submission.csv", index=False) diff --git a/rdagent/scenarios/kaggle/experiment/meta_tpl/cross_validation_tpl.py b/rdagent/scenarios/kaggle/experiment/meta_tpl/cross_validation_tpl.py deleted file mode 100644 index 90ec0c2a..00000000 --- a/rdagent/scenarios/kaggle/experiment/meta_tpl/cross_validation_tpl.py +++ /dev/null @@ -1,87 +0,0 @@ -from pathlib import Path - -import numpy as np -import pandas as pd -from sklearn.model_selection import KFold -from sklearn.preprocessing import LabelEncoder - -from rdagent.scenarios.kaggle.experiment.meta_tpl.fea_share_preprocess import preprocess - - -def compute_metrics_for_classification(y_true, y_pred): - """Compute MCC for classification.""" - from sklearn.metrics import matthews_corrcoef - - return matthews_corrcoef(y_true, y_pred) - - -def perform_kfold_cross_validation(X, y, n_splits=2, random_seed=42): - kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_seed) - fold_metrics = [] - - DIRNAME = Path(__file__).absolute().resolve().parent - - for fold, (train_idx, valid_idx) in enumerate(kf.split(X)): - X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx] - y_train_fold, y_valid_fold = y[train_idx], y[valid_idx] - - # TODO: Preprocess and Feature Engineering before K-Fold CV - - # Preprocess the data - X_train_fold = preprocess(X_train_fold) - X_valid_fold = preprocess(X_valid_fold) - - # Feature Engineering - X_train_l_fold, X_valid_l_fold = [], [] - for f in DIRNAME.glob("feat*.py"): - m = __import__(f.name.strip(".py")) - X_train_fold = m.feat_eng(X_train_fold) - X_valid_fold = m.feat_eng(X_valid_fold) - - X_train_l_fold.append(X_train_fold) - X_valid_l_fold.append(X_valid_fold) - - X_train_fold = pd.concat(X_train_l_fold, axis=1) - X_valid_fold = pd.concat(X_valid_l_fold, axis=1) - - # Align features - X_valid_fold = X_valid_fold.reindex(columns=X_train_fold.columns, fill_value=0) - - # Train and evaluate models - mcc_scores = [] - model_l = [] # Reinitialize model list - for f in DIRNAME.glob("model*.py"): - m = __import__(f.name.strip(".py")) - model = m.fit(X_train_fold, y_train_fold, X_valid_fold, y_valid_fold) - y_valid_pred = m.predict(model, X_valid_fold) - mcc = compute_metrics_for_classification(y_valid_fold, y_valid_pred) - mcc_scores.append(mcc) - print(f"Fold {fold+1}, Model {f.name}: MCC = {mcc}") - - # Store the average MCC score for this fold - avg_mcc = np.mean(mcc_scores) - fold_metrics.append(avg_mcc) - print(f"Fold {fold+1} average MCC: {avg_mcc}") - - # Calculate the overall average MCC - overall_avg_mcc = np.mean(fold_metrics) - result_df = pd.DataFrame({"Overall Average MCC": [overall_avg_mcc]}) - result_df.to_csv(f"path/to/playground-series-s4e8/cv_score_{f.name.strip('.py')}.csv", index=False) - - print(f"Overall Average MCC across all folds: {overall_avg_mcc}") - return overall_avg_mcc - - -# This allows the script to be run directly -if __name__ == "__main__": - # Load and preprocess the data - data_df = pd.read_csv("path/to/playground-series-s4e8/train.csv") - data_df = data_df.drop(["id"], axis=1) - - X = data_df.drop(["class"], axis=1) - y = data_df[["class"]] - - label_encoder = LabelEncoder() - # transfrom y to 1D - y = label_encoder.fit_transform(y) - result = perform_kfold_cross_validation(X, y)