Skip to content

Commit

Permalink
fix: a bug in developer & edit s4e8 template (#338)
Browse files Browse the repository at this point in the history
* s4e8 preprocess remove onehot & fix a bug

* Update runner.py

* Update fea_share_preprocess.py

* Update runner.py

---------

Co-authored-by: WinstonLiyt <[email protected]>
  • Loading branch information
TPLin22 and WinstonLiyt authored Sep 25, 2024
1 parent c86afad commit f12ce72
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 25 deletions.
3 changes: 2 additions & 1 deletion rdagent/scenarios/kaggle/developer/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,8 @@ def develop(self, exp: KGModelExperiment) -> KGModelExperiment:
self.build_from_SOTA(exp)

sub_ws = exp.sub_workspace_list[0]
model_type = sub_ws.target_task.model_type
# TODO: A hybrid model (e.g. lightgbm + xgboost) may be generated, in which case the model_type list has two items. Only the first item is used for now (hard-coded).
model_type = sub_ws.target_task.model_type[0]

if sub_ws.code_dict == {}:
raise ModelEmptyError("No model is implemented.")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder


def prepreprocess():
Expand Down Expand Up @@ -40,42 +40,30 @@ def preprocess_fit(X_train: pd.DataFrame):
categorical_transformer = Pipeline(
steps=[
("imputer", SimpleImputer(strategy="most_frequent")),
("onehot", OneHotEncoder(handle_unknown="ignore")),
("ordinal", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
]
)

numerical_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="mean"))])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
transformers=[
("cat", categorical_transformer, categorical_cols),
("num", numerical_transformer, numerical_cols),
("cat", categorical_transformer, categorical_cols),
]
)

# Fit the preprocessor on the training data
preprocessor.fit(X_train)

return preprocessor

return preprocessor, numerical_cols, categorical_cols

def preprocess_transform(X: pd.DataFrame, preprocessor):
"""
Transforms the given DataFrame using the fitted preprocessor.
Ensures the processed data has consistent features across train, validation, and test sets.
"""
# Transform the data using the fitted preprocessor
X_array = preprocessor.transform(X).toarray()

# Get feature names for the columns in the transformed data
categorical_cols = [cname for cname in X.columns if X[cname].dtype == "object"]
feature_names = preprocessor.named_transformers_["cat"]["onehot"].get_feature_names_out(
categorical_cols
).tolist() + [cname for cname in X.columns if X[cname].dtype in ["int64", "float64"]]
def preprocess_transform(X: pd.DataFrame, preprocessor, numerical_cols, categorical_cols):
X_transformed = preprocessor.transform(X)

# Convert arrays back to DataFrames
X_transformed = pd.DataFrame(X_array, columns=feature_names, index=X.index)
X_transformed = pd.DataFrame(X_transformed, columns=numerical_cols + categorical_cols, index=X.index)

return X_transformed

Expand All @@ -96,16 +84,16 @@ def preprocess_script():
X_train, X_valid, y_train, y_valid = prepreprocess()

# Fit the preprocessor on the training data
preprocessor = preprocess_fit(X_train)
preprocessor, numerical_cols, categorical_cols = preprocess_fit(X_train)

# Preprocess the train, validation, and test data
X_train = preprocess_transform(X_train, preprocessor)
X_valid = preprocess_transform(X_valid, preprocessor)
X_train = preprocess_transform(X_train, preprocessor, numerical_cols, categorical_cols)
X_valid = preprocess_transform(X_valid, preprocessor, numerical_cols, categorical_cols)

# Load and preprocess the test data
submission_df = pd.read_csv("/kaggle/input/test.csv")
passenger_ids = submission_df["id"]
submission_df = submission_df.drop(["id"], axis=1)
X_test = preprocess_transform(submission_df, preprocessor)
X_test = preprocess_transform(submission_df, preprocessor, numerical_cols, categorical_cols)

return X_train, X_valid, y_train, y_valid, X_test, passenger_ids
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import numpy as np
import pandas as pd
from fea_share_preprocess import preprocess_script
from sklearn.metrics import accuracy_score, matthews_corrcoef
from sklearn.metrics import matthews_corrcoef
from sklearn.preprocessing import LabelEncoder

# Set random seed for reproducibility
Expand Down

0 comments on commit f12ce72

Please sign in to comment.