Author: Greg Postalian-Yrausquin. Originally published on Towards AI.

The dataset is tricky: it is a list of Spotify songs, each annotated with numeric attributes that describe how the track sounds. The goal is to see whether those descriptions can be used to identify the music genre or the artist.

```python
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import Birch
from sklearn.cluster import OPTICS
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_samples, silhouette_score, calinski_harabasz_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import warnings
import ast
```

Load the dataset and show a quick description and a sample:

```python
dataset = pd.read_csv("playlist_2010to2022.csv")
print(dataset.info())
print(dataset.isna().sum())
print(dataset.describe())
dataset.head(10)
```

From this dataset I select the audio attributes of the songs; I also save the metadata apart:

```python
dataset = dataset.set_index('track_id')
dataset = dataset.dropna()
metadata = dataset[['track_name','album','artist_id','artist_name','artist_genres','year']]
dataset = dataset[['danceability','energy','key','loudness','speechiness','acousticness',
                   'instrumentalness','liveness','valence','tempo','duration_ms','time_signature']]
```

The following code defines a PCA to reduce the number of variables. This might sound like overkill for this example, but it also serves to scale the variables properly, and in my experience clustering sometimes works better on principal components than on the raw values. Before the PCA, a standard scaler is run on each column.

```python
def varred(simio):
    # Keep enough principal components to explain 85% of the variance
    scaler = PCA(n_components=0.85, svd_solver='full')
    resultsWordstrans = simio.copy()
    resultsWordstrans = scaler.fit_transform(resultsWordstrans)
    resultsWordstrans = pd.DataFrame(resultsWordstrans)
    resultsWordstrans.index = simio.index
    resultsWordstrans.columns = resultsWordstrans.columns.astype(str)
    return resultsWordstrans

def properscaler(simio):
    # Standardize each column to zero mean and unit variance
    scaler = StandardScaler()
    resultsWordstrans = scaler.fit_transform(simio)
    resultsWordstrans = pd.DataFrame(resultsWordstrans)
    resultsWordstrans.index = simio.index
    resultsWordstrans.columns = simio.columns
    return resultsWordstrans

datasetR = properscaler(dataset)
datasetR = varred(datasetR)
```
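As a sanity check, it can help to see how many components survive the 85% variance cutoff. This is a minimal inspection sketch of my own, not part of the original pipeline; it refits a PCA with the same settings purely to read off the attributes:

```python
# Hypothetical inspection step: refit a PCA with the same 0.85 variance cutoff
pca_check = PCA(n_components=0.85, svd_solver='full')
pca_check.fit(properscaler(dataset))
print("components kept:", pca_check.n_components_)
print("variance explained per component:", pca_check.explained_variance_ratio_.round(3))
```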
First, I review the metrics, scanning from 2 up to a maximum of 20 clusters, to determine the number of clusters using the silhouette and Calinski-Harabasz scores.

```python
a = []
X = datasetR.to_numpy(dtype='float')
for ncl in np.arange(2, int(20), 1):
    clusterer = AgglomerativeClustering(n_clusters=int(ncl))
    cluster_labels1 = clusterer.fit_predict(X)
    silhouette_avg1 = silhouette_score(X, cluster_labels1)
    calinski1 = calinski_harabasz_score(X, cluster_labels1)
    clusterer = KMeans(n_clusters=int(ncl))
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        cluster_labels2 = clusterer.fit_predict(X)
    silhouette_avg2 = silhouette_score(X, cluster_labels2)
    calinski2 = calinski_harabasz_score(X, cluster_labels2)
    clusterer = Birch(n_clusters=int(ncl))
    cluster_labels3 = clusterer.fit_predict(X)
    silhouette_avg3 = silhouette_score(X, cluster_labels3)
    calinski3 = calinski_harabasz_score(X, cluster_labels3)
    row = pd.DataFrame({"ncl": [ncl],
                        "silAggCl": [silhouette_avg1], "c_hAggCl": [calinski1],
                        "silKMeans": [silhouette_avg2], "c_hKMeans": [calinski2],
                        "silBirch": [silhouette_avg3], "c_hBirch": [calinski3]})
    a.append(row)
scores = pd.concat(a, ignore_index=True)

plt.style.use('bmh')
fig, [ax_sil, ax_ch] = plt.subplots(1, 2, figsize=(15, 7))
ax_sil.plot(scores["ncl"], scores["silAggCl"], 'g-')
ax_sil.plot(scores["ncl"], scores["silKMeans"], 'b-')
ax_sil.plot(scores["ncl"], scores["silBirch"], 'r-')
ax_ch.plot(scores["ncl"], scores["c_hAggCl"], 'g-', label='Agg Clust')
ax_ch.plot(scores["ncl"], scores["c_hKMeans"], 'b-', label='KMeans')
ax_ch.plot(scores["ncl"], scores["c_hBirch"], 'r-', label='Birch')
ax_sil.set_title("Silhouette curves")
ax_ch.set_title("Calinski Harabasz curves")
ax_sil.set_xlabel('clusters')
ax_sil.set_ylabel('silhouette_avg')
ax_ch.set_xlabel('clusters')
ax_ch.set_ylabel('calinski_harabasz')
ax_ch.legend(loc="upper right")
plt.show()
```

Based on those curves, I pick 11 clusters for agglomerative clustering, 17 for KMeans, and 9 for Birch, and run the final fits. OPTICS and DBSCAN determine the number of clusters on their own, so they are added here without a preset count.

```python
ncl_AggCl = 11
ncl_KMeans = 17
ncl_Birch = 9
X = datasetR.to_numpy(dtype='float')

clusterer1 = AgglomerativeClustering(n_clusters=int(ncl_AggCl))
cluster_labels1 = clusterer1.fit_predict(X)
n_clusters1 = max(cluster_labels1) + 1  # labels run from 0 to k-1
silhouette_avg1 = silhouette_score(X, cluster_labels1)
sample_silhouette_values1 = silhouette_samples(X, cluster_labels1)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    clusterer2 = KMeans(n_clusters=int(ncl_KMeans))
    cluster_labels2 = clusterer2.fit_predict(X)
n_clusters2 = max(cluster_labels2) + 1
silhouette_avg2 = silhouette_score(X, cluster_labels2)
sample_silhouette_values2 = silhouette_samples(X, cluster_labels2)

clusterer3 = Birch(n_clusters=int(ncl_Birch))
cluster_labels3 = clusterer3.fit_predict(X)
n_clusters3 = max(cluster_labels3) + 1
silhouette_avg3 = silhouette_score(X, cluster_labels3)
sample_silhouette_values3 = silhouette_samples(X, cluster_labels3)

clusterer4 = OPTICS(min_samples=2)
cluster_labels4 = clusterer4.fit_predict(X)  # noise points get the label -1
n_clusters4 = max(cluster_labels4) + 1
silhouette_avg4 = silhouette_score(X, cluster_labels4)
sample_silhouette_values4 = silhouette_samples(X, cluster_labels4)

clusterer5 = DBSCAN(eps=1, min_samples=2)
cluster_labels5 = clusterer5.fit_predict(X)  # noise points get the label -1
n_clusters5 = max(cluster_labels5) + 1
silhouette_avg5 = silhouette_score(X, cluster_labels5)
sample_silhouette_values5 = silhouette_samples(X, cluster_labels5)

finalDF = datasetR.copy()
finalDF["clAggCl"] = cluster_labels1
finalDF["clKMeans"] = cluster_labels2
finalDF["clBirch"] = cluster_labels3
finalDF["clOptics"] = cluster_labels4
finalDF["clDbscan"] = cluster_labels5
finalDF["silAggCl"] = sample_silhouette_values1
finalDF["silKMeans"] = sample_silhouette_values2
finalDF["silBirch"] = sample_silhouette_values3
finalDF["silOptics"] = sample_silhouette_values4
finalDF["silDbscan"] = sample_silhouette_values5

finalDFf = pd.merge(finalDF, metadata, left_index=True, right_index=True)
finalDFf['artist_genres'] = finalDFf['artist_genres'].apply(lambda x: ast.literal_eval(x))
```
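Before reading the per-sample plots, a quick numeric comparison can help. This small tabulation is my addition; it only reuses the averages already computed above:

```python
# Minimal sketch: tabulate the average silhouette of each run above
summary = pd.DataFrame({
    "algorithm": ["AggClust", "KMeans", "Birch", "OPTICS", "DBSCAN"],
    "n_clusters": [n_clusters1, n_clusters2, n_clusters3, n_clusters4, n_clusters5],
    "avg_silhouette": [silhouette_avg1, silhouette_avg2, silhouette_avg3,
                       silhouette_avg4, silhouette_avg5],
})
print(summary.sort_values("avg_silhouette", ascending=False).to_string(index=False))
```

Averages hide cluster-level detail, which is exactly what the silhouette plots below expose.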
Next, the per-sample silhouette plots for the five algorithms. The logic is identical for each panel, so it is written once and looped over the five runs:

```python
fig, axes = plt.subplots(1, 5, figsize=(20, 20))
runs = [
    ("Agg. Clustering", cluster_labels1, sample_silhouette_values1, silhouette_avg1, n_clusters1),
    ("KMeans", cluster_labels2, sample_silhouette_values2, silhouette_avg2, n_clusters2),
    ("Birch", cluster_labels3, sample_silhouette_values3, silhouette_avg3, n_clusters3),
    ("OPTICS", cluster_labels4, sample_silhouette_values4, silhouette_avg4, n_clusters4),
    ("DBSCAN", cluster_labels5, sample_silhouette_values5, silhouette_avg5, n_clusters5),
]
for ax, (name, labels, sample_vals, sil_avg, n_clusters) in zip(axes, runs):
    ax.set_xlim([-0.1, 1])
    ax.set_ylim([0, len(X) + (n_clusters + 1) * 10])
    y_lower = 10
    for i in range(min(labels), max(labels) + 1):
        ith_cluster_silhouette_values = sample_vals[labels == i]
        ith_cluster_silhouette_values.sort()
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i
        color = cm.nipy_spectral(float(i) / n_clusters)
        ax.fill_betweenx(
            np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_values,
            facecolor=color, edgecolor=color, alpha=0.7,
        )
        ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
        y_lower = y_upper + 10  # leave a gap of 10 between clusters
    ax.set_title(f"Silhouette plot for {name}")
    ax.set_xlabel("Silhouette coefficient values")
    ax.set_ylabel("Cluster labels")
    ax.axvline(x=sil_avg, color="red", linestyle="--")  # average silhouette
    ax.set_yticks([])  # clear the y-axis labels / ticks
    ax.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
plt.show()
```

Since I need to select genres for the next part of the exercise, I first have to unpack the artist_genres column: each entry is a list, but it is stored as text, so it is parsed with ast.literal_eval and exploded into one row per genre. Counting how many tracks carry each label then shows the unique genre values.

```python
finalDFf = pd.merge(finalDF, metadata, left_index=True, right_index=True)
finalDFf['artist_genres'] = finalDFf['artist_genres'].apply(lambda x: ast.literal_eval(x))
genres = pd.DataFrame(finalDFf['artist_genres'].explode())
finalDFgen = pd.merge(finalDF, genres, left_index=True, right_index=True)
finalDFgen = finalDFgen.drop_duplicates()
genrestbl = (pd.DataFrame(finalDFgen.groupby('artist_genres')['artist_genres'].count())
             .reset_index(names="genre")
             .sort_values(['artist_genres'], ascending=False))
print(genrestbl.head(100).to_string())
```

This is the list of genres selected for the next section:

```python
selectedgenres = ['rock', 'reggaeton', 'house', 'hip pop', 'electro house', 'trap latino',
                  'punk', 'nu metal', 'pop dance']
filtered = finalDFgen[finalDFgen['artist_genres'].isin(selectedgenres)]
fig, [ax1, ax2, ax3, ax4, ax5] = plt.subplots(5, 1, figsize=(10, 20))
sns.scatterplot(data=filtered, x="0", y="1", […]
```
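The per-sample values can also be used to drill into the clusters directly. As an illustrative follow-up of my own (not in the original), this pulls the tracks of the hypothetically most cohesive KMeans cluster, i.e., the one with the highest mean silhouette, from the merged finalDFf frame:

```python
# Illustrative sketch: list tracks from the most cohesive KMeans cluster
best = finalDFf.groupby("clKMeans")["silKMeans"].mean().idxmax()
cohesive = finalDFf[finalDFf["clKMeans"] == best]
print(f"cluster {best}: {len(cohesive)} tracks")
print(cohesive[["track_name", "artist_name"]].head(10).to_string(index=False))
```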
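The snippet above is truncated in the original. Purely as a sketch of what a single panel could look like, and with the hue choice and styling being my assumptions rather than the author's code, a scatter of the first two principal components colored by genre might read:

```python
# Hypothetical single-panel version: first two PCA components, colored by genre
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=filtered, x="0", y="1", hue="artist_genres", ax=ax)
ax.set_title("Tracks in PCA space, colored by selected genre")
plt.show()
```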