In [ ]:
!pip install scikit-learn-extra
Collecting scikit-learn-extra Downloading scikit_learn_extra-0.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.0/2.0 MB 6.4 MB/s eta 0:00:00 Requirement already satisfied: numpy>=1.13.3 in /usr/local/lib/python3.10/dist-packages (from scikit-learn-extra) (1.25.2) Requirement already satisfied: scipy>=0.19.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn-extra) (1.11.4) Requirement already satisfied: scikit-learn>=0.23.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn-extra) (1.2.2) Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.23.0->scikit-learn-extra) (1.4.2) Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.23.0->scikit-learn-extra) (3.5.0) Installing collected packages: scikit-learn-extra Successfully installed scikit-learn-extra-0.3.0
In [ ]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn_extra.cluster import KMedoids
from sklearn.metrics import pairwise_distances_argmin_min
from scipy.spatial.distance import cdist
import numpy as np
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import pairwise_distances
from sklearn.metrics import davies_bouldin_score
import numpy as np
In [ ]:
df = pd.read_excel('/content/K-MEDOIDSs.xlsx')
df = df.drop(['No', 'Nama Nasabah'],axis=1)
df
Out[ ]:
Jumlah Pinjaman | Bunga | Status Pinjaman | |
---|---|---|---|
0 | 70000000 | 5 % | Kurang Lancar |
1 | 50100000 | 5 % | Lancar |
2 | 55000000 | 5 % | Lancar |
3 | 50000000 | 10 % | Lancar |
4 | 61100000 | 5 % | Lancar |
... | ... | ... | ... |
1080 | 55000000 | 5 % | Kurang Lancar |
1081 | 53000000 | 5 % | Lancar |
1082 | 69500000 | 5 % | Lancar |
1083 | 90450000 | 5 % | Kurang Lancar |
1084 | 83000000 | 10 % | Lancar |
1085 rows × 3 columns
Transform data¶
In [ ]:
bunga_encoder = LabelEncoder()
df['Bunga'] = df['Bunga'].str.replace('%', '').astype(float)
df['Bunga'] = bunga_encoder.fit_transform(df['Bunga']) + 1
mapping = {'Tidak Lancar': 1, 'Kurang Lancar': 2, 'Lancar': 3}
df['Status Pinjaman'] = df['Status Pinjaman'].map(mapping)
df
Out[ ]:
Jumlah Pinjaman | Bunga | Status Pinjaman | |
---|---|---|---|
0 | 70000000 | 1 | 2 |
1 | 50100000 | 1 | 3 |
2 | 55000000 | 1 | 3 |
3 | 50000000 | 2 | 3 |
4 | 61100000 | 1 | 3 |
... | ... | ... | ... |
1080 | 55000000 | 1 | 2 |
1081 | 53000000 | 1 | 3 |
1082 | 69500000 | 1 | 3 |
1083 | 90450000 | 1 | 2 |
1084 | 83000000 | 2 | 3 |
1085 rows × 3 columns
Normalisasi¶
In [ ]:
scaler = MinMaxScaler(feature_range=(0, 1))
data = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
data
Out[ ]:
Jumlah Pinjaman | Bunga | Status Pinjaman | |
---|---|---|---|
0 | 0.0400 | 0.0 | 0.5 |
1 | 0.0002 | 0.0 | 1.0 |
2 | 0.0100 | 0.0 | 1.0 |
3 | 0.0000 | 0.5 | 1.0 |
4 | 0.0222 | 0.0 | 1.0 |
... | ... | ... | ... |
1080 | 0.0100 | 0.0 | 0.5 |
1081 | 0.0060 | 0.0 | 1.0 |
1082 | 0.0390 | 0.0 | 1.0 |
1083 | 0.0809 | 0.0 | 0.5 |
1084 | 0.0660 | 0.5 | 1.0 |
1085 rows × 3 columns
In [ ]:
from scipy.stats import zscore
# Mengonversi data menjadi array numpy
data_array = data.to_numpy()
# Deteksi outliers menggunakan Z-score
from scipy import stats
z_scores = np.abs(stats.zscore(data))
outliers = np.where(z_scores > 2) # threshold 2 - 3
# Remove outliers
filtered_data = np.delete(data, outliers[0], axis=0)
data = pd.DataFrame(filtered_data, columns=data.columns)
data.head()
Out[ ]:
Jumlah Pinjaman | Bunga | Status Pinjaman | |
---|---|---|---|
0 | 0.0400 | 0.0 | 0.5 |
1 | 0.0002 | 0.0 | 1.0 |
2 | 0.0100 | 0.0 | 1.0 |
3 | 0.0000 | 0.5 | 1.0 |
4 | 0.0222 | 0.0 | 1.0 |
In [ ]:
Modeling¶
In [ ]:
n_clusters = 5 # <-- UBAH JUMLAH CLUSTER YANG DI INGINKAN (2 - 10)
kmedoids = KMedoids(n_clusters=n_clusters, random_state=42, max_iter=2)
total_simpangan_iterasi_list = []
medoids_list = []
clusterings = []
distances_list = []
centers_list = []
total_distance_difference = 0
medoid_indices_list = []
for _ in range(2):
medoids_indices = np.random.choice(len(data), n_clusters, replace=False)
medoids = data.iloc[medoids_indices]
cluster_labels, distances = pairwise_distances_argmin_min(data, medoids)
total_simpangan_iterasi = 0
for i, medoid_index in enumerate(medoids_indices):
total_simpangan_iterasi += np.sum(np.linalg.norm(data[cluster_labels == i] - data.iloc[medoid_index], axis=1))
total_simpangan_iterasi_list.append(total_simpangan_iterasi)
medoids_list.append(medoids)
clusterings.append(cluster_labels)
distances_list.append(distances)
medoid_indices_list.append(medoids_indices)
cluster_centers = []
for medoid_index in range(len(medoids)):
cluster_data = data[cluster_labels == medoid_index]
cluster_center = cluster_data.mean()
cluster_center['Jumlah Data'] = cluster_data.shape[0]
cluster_centers.append(cluster_center)
centers_list.append(cluster_centers)
for i, (medoids, cluster_labels, distances, centers, medoid_indices) in enumerate(zip(medoids_list, clusterings, distances_list, centers_list, medoid_indices_list), start=1):
print(f"Iterasi {i}:")
centers_df = pd.DataFrame(centers, columns=['Jumlah Pinjaman', 'Bunga', 'Status Pinjaman'])
centers_df.columns = ['X1', 'X2', 'X3']
centers_df['Cluster'] = ['c' + str(i+1) for i in range(n_clusters)]
centers_df.insert(0, 'Data Ke', medoid_indices + 1)
centers_df = centers_df.set_index('Cluster')
print(f"Menentukan Medoid {i} secara acak objek pada masing masing cluster untuk melakukan iterasi ke-{i}:")
print(centers_df)
print()
data_with_info = data.copy()
data_with_info['Jarak Terpendek'] = distances
data_with_info['Clustering'] = ['c' + str(label+1) for label in cluster_labels]
print("Data dengan informasi jarak terpendek dan clustering:")
print(data_with_info.head())
print()
# Hitung DBI untuk setiap cluster
S = []
for k in range(n_clusters):
cluster_data = data[cluster_labels == k]
if cluster_data.shape[0] == 0:
S_k = 0
else:
medoid = data.iloc[medoid_indices[k]]
S_k = np.mean(np.linalg.norm(cluster_data - medoid, axis=1))
S.append(S_k)
M = np.zeros((n_clusters, n_clusters))
for k1 in range(n_clusters):
for k2 in range(n_clusters):
if k1 != k2:
medoid1 = data.iloc[medoid_indices[k1]]
medoid2 = data.iloc[medoid_indices[k2]]
M[k1, k2] = np.linalg.norm(medoid1 - medoid2)
DBI_per_cluster = []
for k in range(n_clusters):
if all(M[k, j] == 0 for j in range(n_clusters) if j != k):
DBI_per_cluster.append(np.nan)
else:
max_ratio = max([(S[k] + S[j]) / M[k, j] if M[k, j] != 0 else np.nan for j in range(n_clusters) if j != k])
DBI_per_cluster.append(max_ratio)
overall_DBI = np.nanmean(DBI_per_cluster)
print(f"DBI Iterasi {i} per Cluster: {DBI_per_cluster}")
print(f"Overall DBI Iterasi {i}: {overall_DBI}\n")
total_distance_difference = total_simpangan_iterasi_list[-1] - total_simpangan_iterasi_list[0]
print("Total Jarak Terpendek dari Masing-masing Iterasi:")
for i, total in enumerate(total_simpangan_iterasi_list, start=1):
print(f"Iterasi {i}: {total}")
print(f"Selisih Total Jarak Terpendek dari Kedua Iterasi: {total_distance_difference}")
Iterasi 1: Menentukan Medoid 1 secara acak objek pada masing masing cluster untuk melakukan iterasi ke-1: Data Ke X1 X2 X3 Cluster c1 145 0.041974 0.5 1.000000 c2 1025 0.039380 0.5 0.288557 c3 200 0.037614 0.0 1.000000 c4 13 0.051439 0.0 0.311364 c5 550 0.009098 0.0 0.272727 Data dengan informasi jarak terpendek dan clustering: Jumlah Pinjaman Bunga Status Pinjaman Jarak Terpendek Clustering 0 0.0400 0.0 0.5 0.0180 c4 1 0.0002 0.0 1.0 0.0218 c3 2 0.0100 0.0 1.0 0.0120 c3 3 0.0000 0.5 1.0 0.0600 c1 4 0.0222 0.0 1.0 0.0002 c3 DBI Iterasi 1 per Cluster: [0.5072604073030141, 0.91656288791305, 0.5155083340158507, 219.803763102466, 219.803763102466] Overall DBI Iterasi 1: 88.30937156683278 Iterasi 2: Menentukan Medoid 2 secara acak objek pada masing masing cluster untuk melakukan iterasi ke-2: Data Ke X1 X2 X3 Cluster c1 458 0.041954 0.000000 0.934053 c2 219 0.003525 0.500000 1.000000 c3 1024 0.050459 0.500000 1.000000 c4 349 0.040364 0.062500 0.006250 c5 380 0.030570 0.276758 0.400612 Data dengan informasi jarak terpendek dan clustering: Jumlah Pinjaman Bunga Status Pinjaman Jarak Terpendek Clustering 0 0.0400 0.0 0.5 0.5000 c5 1 0.0002 0.0 1.0 0.0798 c1 2 0.0100 0.0 1.0 0.0700 c1 3 0.0000 0.5 1.0 0.0030 c2 4 0.0222 0.0 1.0 0.0578 c1 DBI Iterasi 2 per Cluster: [0.6145686043183094, 2.001129817444219, 2.001129817444219, 0.6201076995751492, 0.7224438973088392] Overall DBI Iterasi 2: 1.1918759672181471 Total Jarak Terpendek dari Masing-masing Iterasi: Iterasi 1: 133.04083227116095 Iterasi 2: 173.58952570139616 Selisih Total Jarak Terpendek dari Kedua Iterasi: 40.548693430235204
In [ ]:
# Menggunakan hasil dari iterasi pertama untuk contoh ini
data_iterasi_pertama = data.copy()
data_iterasi_pertama['Jarak Terpendek'] = distances_list[0]
data_iterasi_pertama['Clustering'] = ['c' + str(label+1) for label in clusterings[0]]
# Generate Cluster Label Mapping secara otomatis
cluster_label_mapping = {f'c{i+1}': str(i+1) for i in range(n_clusters)}
# Mengubah cluster label dari 'c1', 'c2', ... menjadi '1', '2', ...
data_iterasi_pertama['Clustering'] = data_iterasi_pertama['Clustering'].replace(cluster_label_mapping)
# Generate Keterangan secara otomatis
reward_values = [30, 20, 10] # Contoh nilai reward, bisa diubah sesuai kebutuhan
keterangan_persentase = {str(i+1): f'Reward {reward_values[i % len(reward_values)]}%' for i in range(n_clusters)}
data_iterasi_pertama['Keterangan'] = data_iterasi_pertama['Clustering'].map(keterangan_persentase)
hasil = pd.concat([data_iterasi_pertama], axis=1)
hasil = hasil[[ 'Clustering', 'Keterangan']]
hasil.head()
Out[ ]:
Clustering | Keterangan | |
---|---|---|
0 | 4 | Reward 30% |
1 | 3 | Reward 10% |
2 | 3 | Reward 10% |
3 | 1 | Reward 30% |
4 | 3 | Reward 10% |
In [ ]:
# Plot penyebaran data clustering dgn 2 dimensi
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Jumlah Pinjaman', y='Bunga', hue='Clustering', data=data_iterasi_pertama, palette='viridis', s=100, alpha=0.6, edgecolor='w')
plt.title('Plot Penyebaran Data Clustering')
plt.xlabel('Jumlah Pinjaman')
plt.ylabel('Bunga')
plt.legend(title='Cluster')
plt.show()
3 Dimensi dengan Atribut Jumlah Pinjaman, Bunga, dan Status Pinjaman
In [ ]:
import plotly.express as px
x_val = 'Jumlah Pinjaman'
y_val = 'Bunga'
z_val = 'Status Pinjaman'
fig = px.scatter_3d(data_iterasi_pertama, x=x_val, y=y_val, z=z_val, color='Clustering', labels='Clustering')
fig.show()
Deteksi Outlier¶
Sebelum Menggunakan K-Medoids
In [ ]:
!pip install hdbscan
Collecting hdbscan Downloading hdbscan-0.8.33.tar.gz (5.2 MB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 5.2/5.2 MB 16.0 MB/s eta 0:00:00 Installing build dependencies ... done Getting requirements to build wheel ... done Preparing metadata (pyproject.toml) ... done Collecting cython<3,>=0.27 (from hdbscan) Using cached Cython-0.29.37-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (1.9 MB) Requirement already satisfied: numpy>=1.20 in /usr/local/lib/python3.10/dist-packages (from hdbscan) (1.25.2) Requirement already satisfied: scipy>=1.0 in /usr/local/lib/python3.10/dist-packages (from hdbscan) (1.11.4) Requirement already satisfied: scikit-learn>=0.20 in /usr/local/lib/python3.10/dist-packages (from hdbscan) (1.2.2) Requirement already satisfied: joblib>=1.0 in /usr/local/lib/python3.10/dist-packages (from hdbscan) (1.4.2) Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.20->hdbscan) (3.5.0) Building wheels for collected packages: hdbscan Building wheel for hdbscan (pyproject.toml) ... done Created wheel for hdbscan: filename=hdbscan-0.8.33-cp310-cp310-linux_x86_64.whl size=3039283 sha256=9f67f431e49532761d5d469157d8242caeada699a0b5f70f9a5e66c165d6fee2 Stored in directory: /root/.cache/pip/wheels/75/0b/3b/dc4f60b7cc455efaefb62883a7483e76f09d06ca81cf87d610 Successfully built hdbscan Installing collected packages: cython, hdbscan Attempting uninstall: cython Found existing installation: Cython 3.0.10 Uninstalling Cython-3.0.10: Successfully uninstalled Cython-3.0.10 Successfully installed cython-0.29.37 hdbscan-0.8.33
In [ ]:
from hdbscan import HDBSCAN
hdb = HDBSCAN().fit(data)
hdb_label = hdb.labels_
data['cluster_hdb'] = hdb_label
plt.figure(figsize=(12, 8))
sns.scatterplot(data=data, x='Jumlah Pinjaman', y='Bunga', hue='cluster_hdb')
plt.show()
In [ ]:
# Visualization Sebelum menggunakan K-medoids
plt.figure(figsize=(12, 8))
plt.scatter(data.loc[data['cluster_hdb']==-1, 'Jumlah Pinjaman'], data.loc[data['cluster_hdb']==-1, 'Bunga'], c='red', label='Outliers')
plt.scatter(data.loc[data['cluster_hdb']!=-1, 'Jumlah Pinjaman'], data.loc[data['cluster_hdb']!=-1, 'Bunga'], c='grey', label='Inliers')
plt.title('Deteksi Outlier Menggunakan HDBSCAN')
plt.xlabel('Jumlah Pinjaman')
plt.ylabel('Bunga')
plt.legend()
plt.show()
Sesudah Menggunakan K-Medoids
In [ ]:
# Visualization Sesudah menggunakan K-Medoids
plt.figure(figsize=(12, 8))
plt.scatter(data_iterasi_pertama.loc[data_iterasi_pertama['Clustering']==-1, 'Jumlah Pinjaman'], data_iterasi_pertama.loc[data_iterasi_pertama['Clustering']==-1, 'Bunga'], c='red', label='Outliers')
plt.scatter(data_iterasi_pertama.loc[data_iterasi_pertama['Clustering']!=-1, 'Jumlah Pinjaman'], data_iterasi_pertama.loc[data_iterasi_pertama['Clustering']!=-1, 'Bunga'], c='grey', label='Inliers')
plt.title('Deteksi Outlier Menggunakan HDBSCAN')
plt.xlabel('Jumlah Pinjaman')
plt.ylabel('Bunga')
plt.legend()
plt.show()
In [ ]:
# Jumlah dari masing-masing clustering terhadap jumlah pinjaman, bunga, dan Status Pinjaman
df_out = data_iterasi_pertama.groupby(by = 'Clustering').sum()[['Jumlah Pinjaman','Bunga', 'Status Pinjaman']].reset_index()
df_out
Out[ ]:
Clustering | Jumlah Pinjaman | Bunga | Status Pinjaman | |
---|---|---|---|---|
0 | 1 | 7.42936 | 88.5 | 177.0 |
1 | 2 | 7.91538 | 100.5 | 58.0 |
2 | 3 | 13.61632 | 0.0 | 362.0 |
3 | 4 | 11.31660 | 0.0 | 68.5 |
4 | 5 | 1.10090 | 0.0 | 33.0 |
In [ ]:
plt.figure(figsize = (18,4))
plt.subplot(1,3,1)
sns.barplot(x= 'Clustering', y = 'Jumlah Pinjaman', data = df_out)
plt.subplot(1,3,2)
sns.barplot(x= 'Clustering', y = 'Bunga', data = df_out)
plt.subplot(1,3,3)
sns.barplot(x= 'Clustering', y = 'Status Pinjaman', data = df_out)
plt.show()
- Grafik Kiri:
- Sumbu Y: Jumlah Pinjaman.
- Klaster 3 memiliki jumlah pinjaman tertinggi, diikuti oleh klaster 4, klaster 2, klaster 1, dan klaster 5 dengan jumlah pinjaman paling rendah.
- Grafik Tengah:
- Sumbu Y: Bunga.
- Klaster 2 memiliki bunga tertinggi, diikuti oleh klaster 1. Klaster lainnya (3, 4, 5) tidak memiliki bunga yang terdata.
- Grafik Kanan:
- Sumbu Y: Status Pinjaman.
- Klaster 3 memiliki status pinjaman tertinggi, diikuti oleh klaster 1, klaster 4, klaster 2, dan klaster 5 dengan status pinjaman paling rendah.
Setiap grafik menunjukkan distribusi berbagai metrik (Jumlah Pinjaman, Bunga, Status Pinjaman) di antara berbagai klaster (1 hingga 5).