# 1. Import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# 2. Read the original dataset (the clean one, not the _noise variant).
df = pd.read_csv('dataset_klasifikasi_modul9_clean.csv')

# 3. If the data was parsed as one long column, split it into its fields.
#    If it already has 7 columns, this step is skipped.
if df.shape[1] == 1:
    df = (
        df.iloc[:, 0]
        .str.replace('"', '', regex=False)
        .str.split(',', expand=True)
    )
    df.columns = [
        'CustomerID',
        'Usia',
        'Pendapatan',
        'FrekuensiBelanja',
        'TotalBelanja',
        'TrenPenjualan',
        'Respon_Promo',
    ]

# Drop a header line only when it is actually present as a data row.
# read_csv normally consumes the header itself, so unconditionally removing
# the first row would silently discard a genuine record.  The explicit
# .copy() makes the column assignments below operate on an independent frame
# rather than on a slice (avoids SettingWithCopyWarning).
is_header_row = df['CustomerID'].astype(str).str.strip() == 'CustomerID'
df = df.loc[~is_header_row].copy()

# Convert the numeric columns; unparseable values become NaN and the rows
# that contain any NaN are removed.
# NOTE(review): the original one-line layout does not show whether this
# clean-up belonged inside the `if` above; it is applied to both paths here
# (a no-op for columns that are already numeric) - confirm.
numeric_cols = ['Usia', 'Pendapatan', 'FrekuensiBelanja', 'TotalBelanja', 'Respon_Promo']
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')
df.dropna(inplace=True)

# 4. Encode the categorical column TrenPenjualan as integer labels.
encoder = LabelEncoder()
df['TrenPenjualan_encoded'] = encoder.fit_transform(df['TrenPenjualan'])

# 5. Build the noisy DataFrame from an independent copy so `df` stays clean.
df_noisy = df.copy()

# 5a. Numeric noise on Pendapatan (+/-5%, uniform).
#     astype(int) truncates the fractional part of the scaled value.
df_noisy['Pendapatan'] = (
    df_noisy['Pendapatan']
    * (1 + np.random.uniform(-0.05, 0.05, size=len(df_noisy)))
).astype(int)

# 5b. Numeric noise on TotalBelanja (+/-10%, uniform).
df_noisy['TotalBelanja'] = (
    df_noisy['TotalBelanja']
    * (1 + np.random.uniform(-0.10, 0.10, size=len(df_noisy)))
).astype(int)

# 5c. Label noise on Respon_Promo: pick 10% of the rows (without
#     replacement) and replace the label with `1 - label`.
#     Assumes the label is binary 0/1 - TODO confirm against the dataset.
n_noise = int(0.10 * len(df_noisy))
idx_noise = np.random.choice(df_noisy.index, size=n_noise, replace=False)
df_noisy.loc[idx_noise, 'Respon_Promo'] = 1 - df_noisy.loc[idx_noise, 'Respon_Promo']

# 6. Quick sanity check of the result.
print(df_noisy.head())
print(df_noisy['Respon_Promo'].value_counts(normalize=True))