feat: add kaggle tpl: feedback-prize (#331)

* change feedback tpl * feedback tpl changes * fix feedback tpl * fix train.py of feedback tpl * add rf model for feedback tpl * fix CI
microsoft · Sep 26, 2024 · a288e39 · a288e39
1 parent 034f238
commit a288e39
Show file tree

Hide file tree

Showing 7 changed files with 211 additions and 311 deletions.
diff --git a/...ggle/experiment/feedback-prize-english-language-learning_template/fea_share_preprocess.py b/...ggle/experiment/feedback-prize-english-language-learning_template/fea_share_preprocess.py
@@ -1,198 +1,48 @@
-# TODO: Fix
+import os
 import re
 
 import numpy as np  # linear algebra
 import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.model_selection import train_test_split
 
-train = pd.read_csv("/kaggle/input/train.csv")
-test = pd.read_csv("/kaggle/input/test.csv")
-submission = pd.read_csv("/kaggle/input/sample_submission.csv")
 
+def preprocess_script():
+    """
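+    Load the competition data and return the preprocessed training, validation, and test sets.
+
+    If /kaggle/input/X_train.pkl exists, the cached pickles are loaded and returned directly,
+    with the contents of others.pkl appended to the returned tuple. Otherwise train.csv and
+    test.csv are read, the "full_text" column is cleaned and TF-IDF vectorized, and the
+    training rows are split 80/20 into training and validation sets (random_state=42).
+
+    Returns (X_train, X_valid, y_train, y_valid, X_test).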
+    """
+    if os.path.exists("/kaggle/input/X_train.pkl"):
+        X_train = pd.read_pickle("/kaggle/input/X_train.pkl")
+        X_valid = pd.read_pickle("/kaggle/input/X_valid.pkl")
+        y_train = pd.read_pickle("/kaggle/input/y_train.pkl")
+        y_valid = pd.read_pickle("/kaggle/input/y_valid.pkl")
+        X_test = pd.read_pickle("/kaggle/input/X_test.pkl")
+        others = pd.read_pickle("/kaggle/input/others.pkl")
 
-features = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]
-target = train[features]
+        return X_train, X_valid, y_train, y_valid, X_test, *others
 
+    def data_cleaner(text):
+        """Strip surrounding whitespace, remove newline characters, and lowercase the text."""
+        text = text.strip()
+        # NOTE(review): newlines are replaced with an empty string, so two words separated
+        # only by a line break are joined into one token — confirm a space is not intended.
+        text = re.sub(r"\n", "", text)
+        text = text.lower()
+        return text
 
-text_train = train["full_text"]
-text_test = test["full_text"]
+    # train
+    train = pd.read_csv("/kaggle/input/train.csv")
+    test = pd.read_csv("/kaggle/input/test.csv")
 
-text = pd.concat([text_train, text_test], ignore_index=True)
+    train["full_text"] = train["full_text"].apply(data_cleaner)
+    test["full_text"] = test["full_text"].apply(data_cleaner)
 
+    y_train = train[["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]]
 
-count_words = text.str.findall(r"(\w+)").str.len()
-print(count_words.sum())
+    vectorizer = TfidfVectorizer()
+    X_train = vectorizer.fit_transform(train["full_text"])
+    X_test = vectorizer.transform(test["full_text"])
 
+    X_train = pd.DataFrame.sparse.from_spmatrix(X_train)
+    X_test = pd.DataFrame.sparse.from_spmatrix(X_test)
 
-""" Cleaning Text """
-text = text.str.lower()
+    X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
 
-# removing special characters and numbers
-text = text.apply(lambda x: re.sub("[^a-z]\s", "", x))
-
-# remove hash tags
-text = text.str.replace("#", "")
-
-# remove words less than 3 character and greater than 7
-text = text.apply(lambda x: " ".join([w for w in x.split() if len(w) > 2 and len(w) < 8]))
-
-# removing stopwords
-# text = text.apply(lambda x : " ".join(word for word in x.split() if word not in stopwords ))
-
-count_words = text.str.findall(r"(\w+)").str.len()
-print(count_words.sum())
-
-
-most_freq_words = pd.Series(" ".join(text).lower().split()).value_counts()[:25]
-text = text.apply(lambda x: " ".join(word for word in x.split() if word not in most_freq_words))
-
-count_words = text.str.findall(r"(\w+)").str.len()
-
-apostrophe_dict = {
-    "ain't": "am not / are not",
-    "aren't": "are not / am not",
-    "can't": "cannot",
-    "can't've": "cannot have",
-    "'cause": "because",
-    "could've": "could have",
-    "couldn't": "could not",
-    "couldn't've": "could not have",
-    "didn't": "did not",
-    "doesn't": "does not",
-    "don't": "do not",
-    "hadn't": "had not",
-    "hadn't've": "had not have",
-    "hasn't": "has not",
-    "haven't": "have not",
-    "he'd": "he had / he would",
-    "he'd've": "he would have",
-    "he'll": "he shall / he will",
-    "he'll've": "he shall have / he will have",
-    "he's": "he has / he is",
-    "how'd": "how did",
-    "how'd'y": "how do you",
-    "how'll": "how will",
-    "how's": "how has / how is",
-    "i'd": "I had / I would",
-    "i'd've": "I would have",
-    "i'll": "I shall / I will",
-    "i'll've": "I shall have / I will have",
-    "i'm": "I am",
-    "i've": "I have",
-    "isn't": "is not",
-    "it'd": "it had / it would",
-    "it'd've": "it would have",
-    "it'll": "it shall / it will",
-    "it'll've": "it shall have / it will have",
-    "it's": "it has / it is",
-    "let's": "let us",
-    "ma'am": "madam",
-    "mayn't": "may not",
-    "might've": "might have",
-    "mightn't": "might not",
-    "mightn't've": "might not have",
-    "must've": "must have",
-    "mustn't": "must not",
-    "mustn't've": "must not have",
-    "needn't": "need not",
-    "needn't've": "need not have",
-    "o'clock": "of the clock",
-    "oughtn't": "ought not",
-    "oughtn't've": "ought not have",
-    "shan't": "shall not",
-    "sha'n't": "shall not",
-    "shan't've": "shall not have",
-    "she'd": "she had / she would",
-    "she'd've": "she would have",
-    "she'll": "she shall / she will",
-    "she'll've": "she shall have / she will have",
-    "she's": "she has / she is",
-    "should've": "should have",
-    "shouldn't": "should not",
-    "shouldn't've": "should not have",
-    "so've": "so have",
-    "so's": "so as / so is",
-    "that'd": "that would / that had",
-    "that'd've": "that would have",
-    "that's": "that has / that is",
-    "there'd": "there had / there would",
-    "there'd've": "there would have",
-    "there's": "there has / there is",
-    "they'd": "they had / they would",
-    "they'd've": "they would have",
-    "they'll": "they shall / they will",
-    "they'll've": "they shall have / they will have",
-    "they're": "they are",
-    "they've": "they have",
-    "to've": "to have",
-    "wasn't": "was not",
-    "we'd": "we had / we would",
-    "we'd've": "we would have",
-    "we'll": "we will",
-    "we'll've": "we will have",
-    "we're": "we are",
-    "we've": "we have",
-    "weren't": "were not",
-    "what'll": "what shall / what will",
-    "what'll've": "what shall have / what will have",
-    "what're": "what are",
-    "what's": "what has / what is",
-    "what've": "what have",
-    "when's": "when has / when is",
-    "when've": "when have",
-    "where'd": "where did",
-    "where's": "where has / where is",
-    "where've": "where have",
-    "who'll": "who shall / who will",
-    "who'll've": "who shall have / who will have",
-    "who's": "who has / who is",
-    "who've": "who have",
-    "why's": "why has / why is",
-    "why've": "why have",
-    "will've": "will have",
-    "won't": "will not",
-    "won't've": "will not have",
-    "would've": "would have",
-    "wouldn't": "would not",
-    "wouldn't've": "would not have",
-    "y'all": "you all",
-    "y'all'd": "you all would",
-    "y'all'd've": "you all would have",
-    "y'all're": "you all are",
-    "y'all've": "you all have",
-    "you'd": "you had / you would",
-    "you'd've": "you would have",
-    "you'll": "you shall / you will",
-    "you'll've": "you shall have / you will have",
-    "you're": "you are",
-    "you've": "you have",
-}
-
-
-def lookup_dict(txt, dictionary):
-    for word in txt.split():
-        if word.lower() in dictionary:
-            if word.lower() in txt.split():
-                txt = txt.replace(word, dictionary[word.lower()])
-    return txt
-
-
-text = text.apply(lambda x: lookup_dict(x, apostrophe_dict))
-
-# Remove rare words
-from collections import Counter
-from itertools import chain
-
-# split words into lists
-v = text.str.split().tolist()
-# compute global word frequency
-c = Counter(chain.from_iterable(v))
-# filter, join, and re-assign
-text = [" ".join([j for j in i if c[j] > 1]) for i in v]
-text = pd.Series(text)
-
-total_word = 0
-for x, word in enumerate(text):
-    num_word = len(word.split())
-    # print(num_word)
-    total_word = total_word + num_word
-print(total_word)
+    return X_train, X_valid, y_train, y_valid, X_test
diff --git a/...os/kaggle/experiment/feedback-prize-english-language-learning_template/feature/feature.py b/...os/kaggle/experiment/feedback-prize-english-language-learning_template/feature/feature.py
@@ -1,16 +1,23 @@
-import numpy as np
 import pandas as pd
-from sklearn.feature_extraction.text import TfidfVectorizer
 
+"""
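+Here is the feature engineering code for each task, with a class that has a fit and transform method.
+Note that the class is exposed through the module-level `feature_engineering_cls` name at the bottom of this file.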
+"""
 
-class TfidfFeature:
+
+class IdentityFeature:
+    """Pass-through feature step: `fit` does nothing and `transform` returns its input."""
+
     def fit(self, train_df: pd.DataFrame):
-        train_df = np.array(train_df).tolist()
-        train_X = list(map("".join, train_df))
-        self.model = TfidfVectorizer(stop_words="english", max_df=0.5, min_df=0.01).fit(train_X)
-        # print(self.model.get_feature_names_out()[:5])
+        """
+        No-op: the identity feature has nothing to learn from the training data.
+        """
+        pass
 
     def transform(self, X: pd.DataFrame):
-        X = np.array(X).tolist()
-        X = list(map("".join, X))
-        return self.model.transform(X)
+        """
+        Return the input data unchanged (the same object, no copy).
+        """
+        return X
+
+
+feature_engineering_cls = IdentityFeature
diff --git a/...narios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model.py b/...narios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model.py
diff --git a/.../experiment/feedback-prize-english-language-learning_template/model/model_randomforest.py b/.../experiment/feedback-prize-english-language-learning_template/model/model_randomforest.py
@@ -0,0 +1,40 @@
+import numpy as np
+import pandas as pd
+from sklearn.ensemble import RandomForestRegressor
+
+
+def select(X: pd.DataFrame) -> pd.DataFrame:
+    """
+    Select the features the model uses; called from both `fit` and `predict`.
+
+    Currently a pass-through: the input frame is returned as-is (same object, no copy).
+    """
+    # All features are treated as relevant for now; feature selection logic can be added here.
+    return X
+
+
+def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
+    """
+    Define and train the Random Forest model on the features returned by `select`.
+
+    X_valid and y_valid are accepted but not used by this implementation.
+    Returns the fitted RandomForestRegressor.
+
+    NOTE(review): y_train is annotated as pd.Series, while the preprocessing script in this
+    commit builds a six-column target frame — confirm which type callers actually pass.
+    """
+    # 100 trees, fixed seed for reproducibility; n_jobs=-1 asks scikit-learn to use all cores.
+    model = RandomForestRegressor(n_estimators=100, random_state=32, n_jobs=-1)
+
+    # Apply the same feature selection that `predict` applies.
+    X_train_selected = select(X_train)
+
+    # Fit the model
+    model.fit(X_train_selected, y_train)
+
+    return model
+
+
+def predict(model, X):
+    """
+    Apply the same feature selection as `fit` and return the model's predictions for X.
+
+    `model` is expected to expose a `predict` method (e.g. the object returned by `fit`).
+    """
+    # Keep feature selection consistent with training.
+    X_selected = select(X)
+
+    # Predict using the trained model
+    y_pred = model.predict(X_selected)
+
+    return y_pred
diff --git a/...aggle/experiment/feedback-prize-english-language-learning_template/model/model_xgboost.py b/...aggle/experiment/feedback-prize-english-language-learning_template/model/model_xgboost.py
@@ -0,0 +1,43 @@
+"""
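+Motivation of the model: gradient-boosted trees (XGBoost) wrapped in a MultiOutputRegressor
+so that a single fit/predict call handles all target columns.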
+"""
+
+import pandas as pd
+import xgboost as xgb
+from sklearn.multioutput import MultiOutputRegressor
+
+
+def select(X: pd.DataFrame) -> pd.DataFrame:
+    """Return the features the model uses; currently a pass-through that returns X as-is."""
+    # Ignore feature selection logic
+    return X
+
+
+def is_sparse_df(df: pd.DataFrame) -> bool:
+    """Return True if at least one column of `df` has a pandas SparseDtype."""
+    # Check the dtype of every column in the DataFrame for a sparse type.
+    # NOTE(review): this uses any(), but the callers in this file go on to use the
+    # `df.sparse` accessor, which pandas only allows when all columns are sparse — a frame
+    # with mixed dense/sparse columns would pass this check and then fail. Confirm whether
+    # all() is intended.
+    return any(isinstance(dtype, pd.SparseDtype) for dtype in df.dtypes)
+
+
+def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame):
+    """
+    Define and train the model, applying `select` to the training features first.
+
+    Wraps an XGBRegressor (500 estimators, squared-error objective, seed 0) in a
+    MultiOutputRegressor and fits it on (X_train, y_train). X_valid and y_valid are
+    accepted but not used here. Returns the fitted MultiOutputRegressor.
+    """
+    X_train = select(X_train)
+
+    xgb_estimator = xgb.XGBRegressor(n_estimators=500, random_state=0, objective="reg:squarederror")
+
+    model = MultiOutputRegressor(xgb_estimator, n_jobs=2)
+
+    # Frames flagged as sparse are converted to a SciPy COO matrix before fitting.
+    if is_sparse_df(X_train):
+        X_train = X_train.sparse.to_coo()
+
+    model.fit(X_train, y_train)
+    return model
+
+
+def predict(model, X_test):
+    """
+    Keep feature select's consistency with `fit` and return the model's predictions.
+
+    X_test goes through `select` and, when flagged as sparse, the same COO conversion
+    that `fit` applies before it is passed to `model.predict`.
+    """
+    X_test = select(X_test)
+    # Mirror the sparse handling used at training time.
+    if is_sparse_df(X_test):
+        X_test = X_test.sparse.to_coo()
+    y_pred = model.predict(X_test)
+    return y_pred