def minDistance(dataSet, centroidList):
    """Group every item of ``dataSet`` under the index of its nearest centroid.

    For each item, the distance to every entry of ``centroidList`` is computed
    with ``calcuDistance`` and the item is appended to the cluster whose
    centroid yields the smallest distance.

    Args:
        dataSet: iterable of items (vectors) to be clustered.
        centroidList: iterable of centroids; a centroid's position in this
            sequence is used as its cluster label.

    Returns:
        dict mapping a cluster label (index into ``centroidList``) to the list
        of items assigned to it. Labels that received no item are absent.
        Items for which no centroid produced a distance strictly below
        infinity (e.g. ``centroidList`` is empty) are filed under label ``-1``.
    """
    clusterDict = dict()  # cluster label -> list of items
    for item in dataSet:
        flag = -1  # label of the closest centroid found so far
        minDis = float("inf")  # start from the largest possible distance
        for i, centroid in enumerate(centroidList):
            distance = calcuDistance(item, centroid)
            # Strict comparison: on a tie the lowest-index centroid wins.
            if distance < minDis:
                minDis = distance
                flag = i
        # After the scan, flag holds the label of the nearest centroid.
        clusterDict.setdefault(flag, []).append(item)
    return clusterDict
def getCentroids(clusterDict):
    """Recompute one centroid per cluster as the mean of its members.

    Args:
        clusterDict: dict mapping a cluster label to the list of items
            (vectors) that belong to that cluster.

    Returns:
        list with one centroid per cluster, each computed by
        ``np.mean(..., axis=0)`` over the cluster's items. Centroids are
        emitted in ascending label order so that, when the labels are
        ``0..k-1``, ``centroidList[label]`` is the centroid of that cluster
        regardless of the order in which the labels were inserted.
    """
    centroidList = []
    # Sort the labels: dict iteration follows insertion order, which depends
    # on which cluster happened to receive an item first.
    for key in sorted(clusterDict):
        centroid = np.mean(clusterDict[key], axis=0)
        centroidList.append(centroid)
    return centroidList
# Compute the mean squared error across the clusters to measure the quality of the clustering.
def getVar(centroidList, clusterDict):
    """Return the total distance between every item and its cluster centroid.

    For each cluster, the distances (as given by ``calcuDistance``) between
    the cluster's centroid and each of its items are accumulated; the
    per-cluster totals are then added together.

    NOTE(review): the original comment calls this a mean squared error, but
    no averaging or squaring happens here; whether the distances are squared
    depends on ``calcuDistance`` -- confirm.

    Args:
        centroidList: sequence of centroids indexed by cluster label.
        clusterDict: dict mapping a cluster label to the list of items in
            that cluster; each label is used as an index into
            ``centroidList``.

    Returns:
        float: the accumulated distance, ``0.0`` when there are no items.
    """
    total = 0.0
    for key in clusterDict:
        centroid = centroidList[key]
        # Accumulate this cluster's distances separately, then fold the
        # subtotal into the overall total once per cluster.
        distance = 0.0
        for item in clusterDict[key]:
            distance += calcuDistance(centroid, item)
        total += distance
    return total