我正在尝试调整一种算法,这在文章中找到
https://www.alessiovaccaro.com/resources/kmeans.php#5.-Modellazione-con-K-means
研究通过无人机测量获得的点云的行为。我不是很有经验,但我不知道如何修复它在第51行返回的错误。我附上了代码和开始的txt文件(
https://drive.google.com/file/d/1AD6O7m8yAsTt7UnVh9MQlNpAXuoVtXdW/view?usp=sharing
). 我怎样才能解决这个问题?
import pandas as pd
import numpy as np
import seaborn as sns # For data visualization and specifically for pairplot()
import matplotlib.pyplot as plt # For data visualization
from sklearn.preprocessing import StandardScaler # To transform the dataset
from sklearn.cluster import KMeans # To instantiate, train and use model
from sklearn import metrics # For Model Evaluation
#Input nuvola di punti e assegnazione
point_cloud= np.loadtxt(r'D:/Tesi/K-means/Nuvola_prova.txt',skiprows=2)
#Creazione dataframe
dataframe=pd.DataFrame(list(zip(point_cloud[:,0],point_cloud[:,1],point_cloud[:,2],point_cloud[:,3],
point_cloud[:,4], point_cloud[:,5], point_cloud[:,11], point_cloud[:,13],
point_cloud[:,14])), columns=['X', 'Y','Z','R', 'G', 'B', 'Intensity', 'Density', 'NCR'])
#Calcolo matrice di correlazione
dataframe.corr()
plt.figure(figsize = (15,6))
sns.heatmap( dataframe.corr(), annot=True)
#Pairplot (per vedere a grafico tutti gli accoppiamenti)
sns.pairplot(dataframe)
plt.figure(figsize = (45,18))
plt.show()
#Boxplot (distribuzione di più variabili contemporaneamente)
plt.figure(figsize = (15,4))
sns.boxplot(data = dataframe, orient = "h")
plt.show()
#Standardizzazione del dataset per evitare problemi di visualizzazione
scaler = StandardScaler()
scaled_array = scaler.fit_transform(dataframe)
scaled_dataframe = pd.DataFrame( scaled_array, columns = dataframe.columns )
plt.figure(figsize = (15,4))
sns.boxplot(data = scaled_dataframe, orient = "h")
plt.show()
scaled_dataframe.describe()
#Modellazione con K-means
#Decisione del numero di cluster di partenza
kmeans_model = KMeans(n_clusters = 4)
#Fitting
kmeans_model.fit(scaled_dataframe)
centroids = kmeans_model.cluster_centers_
centroids
kmeans_model.cluster_centers_.shape
#Visualizzazione risultati
kmeans_model.labels_
dataframe["cluster"] = kmeans_model.labels_
dataframe
sns.pairplot(data = dataframe, hue = "cluster", palette = "Accent_r")
plt.show()
#Ricerca del numero ottimale di cluster tramite indice di silhouette
#Si itera da 2 a 100 cluster e si salva l'indice in un dizionario
k_to_test = range(2,100,1)
silhouette_scores = {}
for k in k_to_test:
model_kmeans_k = KMeans( n_clusters = k )
model_kmeans_k.fit(scaled_dataframe.drop("cluster", axis = 1))
labels_k = model_kmeans_k.labels_
score_k = metrics.silhouette_score(scaled_dataframe.drop("cluster", axis=1), labels_k)
silhouette_scores[k] = score_k
print("Tested kMeans with k = %d\tSS: %5.4f" % (k, score_k))
print("Done!")
#Si graficano i risultati
plt.figure(figsize = (16,5))
plt.plot(silhouette_scores.values())
plt.xticks(range(0,23,1), silhouette_scores.keys())
plt.title("Silhouette Metric")
plt.xlabel("k")
plt.ylabel("Silhouette")
plt.axvline(1, color = "r")
plt.show()
File "D:\Tesi\K-means\Main.py", line 52, in <module>
kmeans_model.fit(scaled_dataframe)
File "D:\Anaconda\envs\py39\lib\site-packages\sklearn\cluster\_kmeans.py", line 1137, in fit
X = self._validate_data(
File "D:\Anaconda\envs\py39\lib\site-packages\sklearn\base.py", line 566, in _validate_data
X = check_array(X, **check_params)
File "D:\Anaconda\envs\py39\lib\site-packages\sklearn\utils\validation.py", line 800, in check_array
_assert_all_finite(array, allow_nan=force_all_finite == "allow-nan")
File "D:\Anaconda\envs\py39\lib\site-packages\sklearn\utils\validation.py", line 114, in _assert_all_finite
raise ValueError(
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
我认为这与一个事实有关,在其中一列中有非数值(不是数字),这会让一切陷入危机。