In [ ]:
!pip install scikit-learn-extra
Collecting scikit-learn-extra
  Downloading scikit_learn_extra-0.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.0/2.0 MB 6.4 MB/s eta 0:00:00
Requirement already satisfied: numpy>=1.13.3 in /usr/local/lib/python3.10/dist-packages (from scikit-learn-extra) (1.25.2)
Requirement already satisfied: scipy>=0.19.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn-extra) (1.11.4)
Requirement already satisfied: scikit-learn>=0.23.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn-extra) (1.2.2)
Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.23.0->scikit-learn-extra) (1.4.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.23.0->scikit-learn-extra) (3.5.0)
Installing collected packages: scikit-learn-extra
Successfully installed scikit-learn-extra-0.3.0
In [ ]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn_extra.cluster import KMedoids
from sklearn.metrics import pairwise_distances_argmin_min
from scipy.spatial.distance import cdist
import numpy as np
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import pairwise_distances
from sklearn.metrics import davies_bouldin_score
import numpy as np
In [ ]:
df = pd.read_excel('/content/K-MEDOIDSs.xlsx')
df = df.drop(['No', 'Nama Nasabah'],axis=1)
df
Out[ ]:
Jumlah Pinjaman Bunga Status Pinjaman
0 70000000 5 % Kurang Lancar
1 50100000 5 % Lancar
2 55000000 5 % Lancar
3 50000000 10 % Lancar
4 61100000 5 % Lancar
... ... ... ...
1080 55000000 5 % Kurang Lancar
1081 53000000 5 % Lancar
1082 69500000 5 % Lancar
1083 90450000 5 % Kurang Lancar
1084 83000000 10 % Lancar

1085 rows × 3 columns

Transform data¶

In [ ]:
bunga_encoder = LabelEncoder()

df['Bunga'] = df['Bunga'].str.replace('%', '').astype(float)

df['Bunga'] = bunga_encoder.fit_transform(df['Bunga']) + 1

mapping = {'Tidak Lancar': 1, 'Kurang Lancar': 2, 'Lancar': 3}

df['Status Pinjaman'] = df['Status Pinjaman'].map(mapping)
df
Out[ ]:
Jumlah Pinjaman Bunga Status Pinjaman
0 70000000 1 2
1 50100000 1 3
2 55000000 1 3
3 50000000 2 3
4 61100000 1 3
... ... ... ...
1080 55000000 1 2
1081 53000000 1 3
1082 69500000 1 3
1083 90450000 1 2
1084 83000000 2 3

1085 rows × 3 columns

Normalisasi¶

In [ ]:
scaler = MinMaxScaler(feature_range=(0, 1))
data = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
data
Out[ ]:
Jumlah Pinjaman Bunga Status Pinjaman
0 0.0400 0.0 0.5
1 0.0002 0.0 1.0
2 0.0100 0.0 1.0
3 0.0000 0.5 1.0
4 0.0222 0.0 1.0
... ... ... ...
1080 0.0100 0.0 0.5
1081 0.0060 0.0 1.0
1082 0.0390 0.0 1.0
1083 0.0809 0.0 0.5
1084 0.0660 0.5 1.0

1085 rows × 3 columns

In [ ]:
from scipy.stats import zscore

# Mengonversi data menjadi array numpy
data_array = data.to_numpy()

# Deteksi outliers menggunakan Z-score
from scipy import stats
z_scores = np.abs(stats.zscore(data))
outliers = np.where(z_scores > 2)  # threshold 2 - 3

# Remove outliers
filtered_data = np.delete(data, outliers[0], axis=0)
data = pd.DataFrame(filtered_data, columns=data.columns)
data.head()
Out[ ]:
Jumlah Pinjaman Bunga Status Pinjaman
0 0.0400 0.0 0.5
1 0.0002 0.0 1.0
2 0.0100 0.0 1.0
3 0.0000 0.5 1.0
4 0.0222 0.0 1.0
In [ ]:

Modeling¶

In [ ]:
n_clusters = 5  # <-- UBAH JUMLAH CLUSTER YANG DI INGINKAN (2 - 10)
kmedoids = KMedoids(n_clusters=n_clusters, random_state=42, max_iter=2)
total_simpangan_iterasi_list = []
medoids_list = []
clusterings = []
distances_list = []
centers_list = []
total_distance_difference = 0
medoid_indices_list = []

for _ in range(2):
    medoids_indices = np.random.choice(len(data), n_clusters, replace=False)
    medoids = data.iloc[medoids_indices]
    cluster_labels, distances = pairwise_distances_argmin_min(data, medoids)

    total_simpangan_iterasi = 0
    for i, medoid_index in enumerate(medoids_indices):
        total_simpangan_iterasi += np.sum(np.linalg.norm(data[cluster_labels == i] - data.iloc[medoid_index], axis=1))

    total_simpangan_iterasi_list.append(total_simpangan_iterasi)
    medoids_list.append(medoids)
    clusterings.append(cluster_labels)
    distances_list.append(distances)
    medoid_indices_list.append(medoids_indices)

    cluster_centers = []
    for medoid_index in range(len(medoids)):
        cluster_data = data[cluster_labels == medoid_index]
        cluster_center = cluster_data.mean()
        cluster_center['Jumlah Data'] = cluster_data.shape[0]
        cluster_centers.append(cluster_center)
    centers_list.append(cluster_centers)

for i, (medoids, cluster_labels, distances, centers, medoid_indices) in enumerate(zip(medoids_list, clusterings, distances_list, centers_list, medoid_indices_list), start=1):
    print(f"Iterasi {i}:")
    centers_df = pd.DataFrame(centers, columns=['Jumlah Pinjaman', 'Bunga', 'Status Pinjaman'])
    centers_df.columns = ['X1', 'X2', 'X3']
    centers_df['Cluster'] = ['c' + str(i+1) for i in range(n_clusters)]
    centers_df.insert(0, 'Data Ke', medoid_indices + 1)
    centers_df = centers_df.set_index('Cluster')

    print(f"Menentukan Medoid {i} secara acak objek pada masing masing cluster untuk melakukan iterasi ke-{i}:")
    print(centers_df)
    print()

    data_with_info = data.copy()
    data_with_info['Jarak Terpendek'] = distances
    data_with_info['Clustering'] = ['c' + str(label+1) for label in cluster_labels]
    print("Data dengan informasi jarak terpendek dan clustering:")
    print(data_with_info.head())
    print()

    # Hitung DBI untuk setiap cluster
    S = []
    for k in range(n_clusters):
        cluster_data = data[cluster_labels == k]
        if cluster_data.shape[0] == 0:
            S_k = 0
        else:
            medoid = data.iloc[medoid_indices[k]]
            S_k = np.mean(np.linalg.norm(cluster_data - medoid, axis=1))
        S.append(S_k)

    M = np.zeros((n_clusters, n_clusters))
    for k1 in range(n_clusters):
        for k2 in range(n_clusters):
            if k1 != k2:
                medoid1 = data.iloc[medoid_indices[k1]]
                medoid2 = data.iloc[medoid_indices[k2]]
                M[k1, k2] = np.linalg.norm(medoid1 - medoid2)

    DBI_per_cluster = []
    for k in range(n_clusters):
        if all(M[k, j] == 0 for j in range(n_clusters) if j != k):
            DBI_per_cluster.append(np.nan)
        else:
            max_ratio = max([(S[k] + S[j]) / M[k, j] if M[k, j] != 0 else np.nan for j in range(n_clusters) if j != k])
            DBI_per_cluster.append(max_ratio)
    overall_DBI = np.nanmean(DBI_per_cluster)

    print(f"DBI Iterasi {i} per Cluster: {DBI_per_cluster}")
    print(f"Overall DBI Iterasi {i}: {overall_DBI}\n")

total_distance_difference = total_simpangan_iterasi_list[-1] - total_simpangan_iterasi_list[0]

print("Total Jarak Terpendek dari Masing-masing Iterasi:")
for i, total in enumerate(total_simpangan_iterasi_list, start=1):
    print(f"Iterasi {i}: {total}")

print(f"Selisih Total Jarak Terpendek dari Kedua Iterasi: {total_distance_difference}")
Iterasi 1:
Menentukan Medoid 1 secara acak objek pada masing masing cluster untuk melakukan iterasi ke-1:
         Data Ke        X1   X2        X3
Cluster                                  
c1           145  0.041974  0.5  1.000000
c2          1025  0.039380  0.5  0.288557
c3           200  0.037614  0.0  1.000000
c4            13  0.051439  0.0  0.311364
c5           550  0.009098  0.0  0.272727

Data dengan informasi jarak terpendek dan clustering:
   Jumlah Pinjaman  Bunga  Status Pinjaman  Jarak Terpendek Clustering
0           0.0400    0.0              0.5           0.0180         c4
1           0.0002    0.0              1.0           0.0218         c3
2           0.0100    0.0              1.0           0.0120         c3
3           0.0000    0.5              1.0           0.0600         c1
4           0.0222    0.0              1.0           0.0002         c3

DBI Iterasi 1 per Cluster: [0.5072604073030141, 0.91656288791305, 0.5155083340158507, 219.803763102466, 219.803763102466]
Overall DBI Iterasi 1: 88.30937156683278

Iterasi 2:
Menentukan Medoid 2 secara acak objek pada masing masing cluster untuk melakukan iterasi ke-2:
         Data Ke        X1        X2        X3
Cluster                                       
c1           458  0.041954  0.000000  0.934053
c2           219  0.003525  0.500000  1.000000
c3          1024  0.050459  0.500000  1.000000
c4           349  0.040364  0.062500  0.006250
c5           380  0.030570  0.276758  0.400612

Data dengan informasi jarak terpendek dan clustering:
   Jumlah Pinjaman  Bunga  Status Pinjaman  Jarak Terpendek Clustering
0           0.0400    0.0              0.5           0.5000         c5
1           0.0002    0.0              1.0           0.0798         c1
2           0.0100    0.0              1.0           0.0700         c1
3           0.0000    0.5              1.0           0.0030         c2
4           0.0222    0.0              1.0           0.0578         c1

DBI Iterasi 2 per Cluster: [0.6145686043183094, 2.001129817444219, 2.001129817444219, 0.6201076995751492, 0.7224438973088392]
Overall DBI Iterasi 2: 1.1918759672181471

Total Jarak Terpendek dari Masing-masing Iterasi:
Iterasi 1: 133.04083227116095
Iterasi 2: 173.58952570139616
Selisih Total Jarak Terpendek dari Kedua Iterasi: 40.548693430235204
In [ ]:
# Menggunakan hasil dari iterasi pertama untuk contoh ini
data_iterasi_pertama = data.copy()
data_iterasi_pertama['Jarak Terpendek'] = distances_list[0]
data_iterasi_pertama['Clustering'] = ['c' + str(label+1) for label in clusterings[0]]

# Generate Cluster Label Mapping secara otomatis
cluster_label_mapping = {f'c{i+1}': str(i+1) for i in range(n_clusters)}

# Mengubah cluster label dari 'c1', 'c2', ... menjadi '1', '2', ...
data_iterasi_pertama['Clustering'] = data_iterasi_pertama['Clustering'].replace(cluster_label_mapping)

# Generate Keterangan secara otomatis
reward_values = [30, 20, 10]  # Contoh nilai reward, bisa diubah sesuai kebutuhan
keterangan_persentase = {str(i+1): f'Reward {reward_values[i % len(reward_values)]}%' for i in range(n_clusters)}

data_iterasi_pertama['Keterangan'] = data_iterasi_pertama['Clustering'].map(keterangan_persentase)
hasil = pd.concat([data_iterasi_pertama], axis=1)

hasil = hasil[[ 'Clustering', 'Keterangan']]
hasil.head()
Out[ ]:
Clustering Keterangan
0 4 Reward 30%
1 3 Reward 10%
2 3 Reward 10%
3 1 Reward 30%
4 3 Reward 10%
In [ ]:
# Plot penyebaran data clustering dgn 2 dimensi
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(10, 6))
sns.scatterplot(x='Jumlah Pinjaman', y='Bunga', hue='Clustering', data=data_iterasi_pertama, palette='viridis', s=100, alpha=0.6, edgecolor='w')

plt.title('Plot Penyebaran Data Clustering')
plt.xlabel('Jumlah Pinjaman')
plt.ylabel('Bunga')
plt.legend(title='Cluster')
plt.show()
No description has been provided for this image

3 Dimensi dengan Atribut Jumlah Pinjaman, Bunga, dan Status Pinjaman

In [ ]:
import plotly.express as px

x_val = 'Jumlah Pinjaman'
y_val = 'Bunga'
z_val = 'Status Pinjaman'

fig = px.scatter_3d(data_iterasi_pertama, x=x_val, y=y_val, z=z_val, color='Clustering', labels='Clustering')
fig.show()

Deteksi Outlier¶

Sebelum Menggunakan K-Medoids

In [ ]:
!pip install hdbscan
Collecting hdbscan
  Downloading hdbscan-0.8.33.tar.gz (5.2 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 5.2/5.2 MB 16.0 MB/s eta 0:00:00
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
Collecting cython<3,>=0.27 (from hdbscan)
  Using cached Cython-0.29.37-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (1.9 MB)
Requirement already satisfied: numpy>=1.20 in /usr/local/lib/python3.10/dist-packages (from hdbscan) (1.25.2)
Requirement already satisfied: scipy>=1.0 in /usr/local/lib/python3.10/dist-packages (from hdbscan) (1.11.4)
Requirement already satisfied: scikit-learn>=0.20 in /usr/local/lib/python3.10/dist-packages (from hdbscan) (1.2.2)
Requirement already satisfied: joblib>=1.0 in /usr/local/lib/python3.10/dist-packages (from hdbscan) (1.4.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.20->hdbscan) (3.5.0)
Building wheels for collected packages: hdbscan
  Building wheel for hdbscan (pyproject.toml) ... done
  Created wheel for hdbscan: filename=hdbscan-0.8.33-cp310-cp310-linux_x86_64.whl size=3039283 sha256=9f67f431e49532761d5d469157d8242caeada699a0b5f70f9a5e66c165d6fee2
  Stored in directory: /root/.cache/pip/wheels/75/0b/3b/dc4f60b7cc455efaefb62883a7483e76f09d06ca81cf87d610
Successfully built hdbscan
Installing collected packages: cython, hdbscan
  Attempting uninstall: cython
    Found existing installation: Cython 3.0.10
    Uninstalling Cython-3.0.10:
      Successfully uninstalled Cython-3.0.10
Successfully installed cython-0.29.37 hdbscan-0.8.33
In [ ]:
from hdbscan import HDBSCAN

hdb = HDBSCAN().fit(data)
hdb_label = hdb.labels_

data['cluster_hdb'] = hdb_label

plt.figure(figsize=(12, 8))
sns.scatterplot(data=data, x='Jumlah Pinjaman', y='Bunga', hue='cluster_hdb')
plt.show()
No description has been provided for this image
In [ ]:
# Visualization Sebelum menggunakan K-medoids
plt.figure(figsize=(12, 8))
plt.scatter(data.loc[data['cluster_hdb']==-1, 'Jumlah Pinjaman'], data.loc[data['cluster_hdb']==-1, 'Bunga'], c='red', label='Outliers')
plt.scatter(data.loc[data['cluster_hdb']!=-1, 'Jumlah Pinjaman'], data.loc[data['cluster_hdb']!=-1, 'Bunga'], c='grey', label='Inliers')
plt.title('Deteksi Outlier Menggunakan HDBSCAN')
plt.xlabel('Jumlah Pinjaman')
plt.ylabel('Bunga')
plt.legend()
plt.show()
No description has been provided for this image

Sesudah Menggunakan K-Medoids

In [ ]:
# Visualization Sesudah menggunakan K-Medoids
plt.figure(figsize=(12, 8))
plt.scatter(data_iterasi_pertama.loc[data_iterasi_pertama['Clustering']==-1, 'Jumlah Pinjaman'], data_iterasi_pertama.loc[data_iterasi_pertama['Clustering']==-1, 'Bunga'], c='red', label='Outliers')
plt.scatter(data_iterasi_pertama.loc[data_iterasi_pertama['Clustering']!=-1, 'Jumlah Pinjaman'], data_iterasi_pertama.loc[data_iterasi_pertama['Clustering']!=-1, 'Bunga'], c='grey', label='Inliers')
plt.title('Deteksi Outlier Menggunakan HDBSCAN')
plt.xlabel('Jumlah Pinjaman')
plt.ylabel('Bunga')
plt.legend()
plt.show()
No description has been provided for this image
In [ ]:
# Jumlah dari masing-masing clustering terhadap jumlah pinjaman, bunga, dan Status Pinjaman

df_out = data_iterasi_pertama.groupby(by = 'Clustering').sum()[['Jumlah Pinjaman','Bunga', 'Status Pinjaman']].reset_index()
df_out
Out[ ]:
Clustering Jumlah Pinjaman Bunga Status Pinjaman
0 1 7.42936 88.5 177.0
1 2 7.91538 100.5 58.0
2 3 13.61632 0.0 362.0
3 4 11.31660 0.0 68.5
4 5 1.10090 0.0 33.0
In [ ]:
plt.figure(figsize = (18,4))
plt.subplot(1,3,1)
sns.barplot(x= 'Clustering', y = 'Jumlah Pinjaman', data = df_out)
plt.subplot(1,3,2)
sns.barplot(x= 'Clustering', y = 'Bunga', data = df_out)
plt.subplot(1,3,3)
sns.barplot(x= 'Clustering', y = 'Status Pinjaman', data = df_out)
plt.show()
No description has been provided for this image
  1. Grafik Kiri:
  • Sumbu Y: Jumlah Pinjaman.
  • Klaster 3 memiliki jumlah pinjaman tertinggi, diikuti oleh klaster 4, klaster 2, klaster 1, dan klaster 5 dengan jumlah pinjaman paling rendah.
  1. Grafik Tengah:
  • Sumbu Y: Bunga.
  • Klaster 2 memiliki bunga tertinggi, diikuti oleh klaster 1. Klaster lainnya (3, 4, 5) tidak memiliki bunga yang terdata.
  1. Grafik Kanan:
  • Sumbu Y: Status Pinjaman.
  • Klaster 3 memiliki status pinjaman tertinggi, diikuti oleh klaster 1, klaster 4, klaster 2, dan klaster 5 dengan status pinjaman paling rendah.

Setiap grafik menunjukkan distribusi berbagai metrik (Jumlah Pinjaman, Bunga, Status Pinjaman) di antara berbagai klaster (1 hingga 5).