Train-test split
# Clean the raw dataframe, then build the stratified train/test split.
#
# The original dataframe 'df' is expected to carry its header row as the first
# data entry (index 0). That stray row forces the numeric columns to object
# dtype and adds a unique string value to 'Respon_Promo', so it must be removed
# before the split.

numeric_cols = ['Usia', 'Pendapatan', 'FrekuensiBelanja', 'TotalBelanja', 'Respon_Promo']
# Every column that feeds X or y; only these decide whether a row is usable.
required_cols = numeric_cols + ['TrenPenjualan']

# Drop the first row only when it really repeats the column names. Dropping it
# unconditionally would silently discard a genuine observation whenever 'df'
# has already been loaded correctly upstream.
header_in_first_row = len(df) > 0 and all(
    str(df[col].iloc[0]).strip() == col for col in numeric_cols
)

# Work on an independent copy so the global 'df' is never modified and the
# column assignments below never write into a slice of another frame.
df_cleaned = df.iloc[1:].copy() if header_in_first_row else df.copy()

# Convert the numeric columns, coercing anything unparsable to NaN. A header
# row that slipped past the check above (e.g. different casing) becomes NaN
# here and is removed by the dropna below.
df_cleaned[numeric_cols] = df_cleaned[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Drop only rows that are unusable for modelling. Restricting to required_cols
# keeps rows whose NaN sits in a column the model never reads, and guarantees
# LabelEncoder does not receive NaN in 'TrenPenjualan'.
df_cleaned = df_cleaned.dropna(subset=required_cols)

# Re-encode 'TrenPenjualan' on the cleaned data: an encoder fitted while the
# header row was still present would have learned the header text as a class.
encoder = LabelEncoder()
df_cleaned['TrenPenjualan_encoded'] = encoder.fit_transform(df_cleaned['TrenPenjualan'])

# Redefine features (X) and target (y) using the cleaned dataframe.
X = df_cleaned[['Usia', 'Pendapatan', 'FrekuensiBelanja', 'TotalBelanja', 'TrenPenjualan_encoded']]
y = df_cleaned['Respon_Promo']

# 80/20 split; stratify=y keeps the class proportions of 'Respon_Promo' the
# same in both partitions, and random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)