本文主要介绍 Python 中实现聚类算法的小实例。在 Python 中，主要使用 scikit-learn 模块中的相关方法来实现聚类算法。
http://scikit-learn.org/stable/modules/clustering.html#hierarchical-clustering
# !/usr/bin/env python
# -*- coding: utf-8 -*-
# __author__ = "abdata"
import time
import numpy as np
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
# from sklearn import datasets
# import matplotlib.pyplot as plt
# Load the iris CSV from the blog's data host and keep positional columns
# 2..5. Per the output recorded after this script, these are Sepal.Length,
# Sepal.Width, Petal.Length and Petal.Width (150 rows x 4 columns).
mdata = pd.read_csv('http://data.galaxystatistics.com/blog_data/regression/iris.csv')
X = mdata.iloc[:, 2:6]
print(X.head())
print(X.shape)
print(type(X))

# Fit a 3-cluster Ward-linkage agglomerative model and time the fit.
#
# * The distance metric is left at its default (Euclidean, which is what
#   Ward linkage requires). The old `affinity` keyword was deprecated in
#   scikit-learn 1.2 and removed in 1.4, so omitting it keeps the script
#   working on both old and new releases.
# * `compute_full_tree` must be a bool or 'auto'. The original passed the
#   string 'false', which is truthy and therefore built the full tree (the
#   recorded `children_` output lists all 149 merges); newer releases reject
#   the string outright. `True` keeps the recorded behaviour explicitly.
t0 = time.time()
model = AgglomerativeClustering(
    n_clusters=3,
    linkage='ward',
    compute_full_tree=True,
).fit(X)
t = time.time() - t0
print(t)

print(model.labels_)
print(model.n_leaves_)
# `n_components_` was renamed to `n_connected_components_` in newer
# scikit-learn releases; fall back to the old name on old installations.
if hasattr(model, 'n_connected_components_'):
    print(model.n_connected_components_)
else:
    print(model.n_components_)
print(model.children_)
print(type(model))
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 0 5.1 3.5 1.4 0.2
## 1 4.9 3.0 1.4 0.2
## 2 4.7 3.2 1.3 0.2
## 3 4.6 3.1 1.5 0.2
## 4 5.0 3.6 1.4 0.2
## (150, 4)
## <class 'pandas.core.frame.DataFrame'>
## 0.020556211471557617
## [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 2 2 2 2 0 2 2 2 2
## 2 2 0 0 2 2 2 2 0 2 0 2 0 2 2 0 0 2 2 2 2 2 0 0 2 2 2 0 2 2 2 0 2 2 2 0 2
## 2 0]
## 150
## 1
## [[101 142]
## [ 7 39]
## [ 0 17]
## [ 9 34]
## [128 132]
## [ 10 48]
## [ 4 37]
## [ 19 21]
## [ 29 30]
## [ 57 93]
## [ 80 81]
## [116 137]
## [ 8 38]
## [ 27 28]
## [ 3 47]
## [ 82 92]
## [ 95 96]
## [127 138]
## [ 1 45]
## [ 63 91]
## [ 65 75]
## [ 40 152]
## [123 126]
## [ 49 151]
## [112 139]
## [ 94 99]
## [ 12 168]
## [ 88 166]
## [ 66 84]
## [ 23 26]
## [ 53 89]
## [ 74 97]
## [ 25 153]
## [ 46 157]
## [ 2 164]
## [110 147]
## [120 143]
## [136 148]
## [ 78 169]
## [ 69 160]
## [ 54 58]
## [140 144]
## [141 145]
## [ 43 179]
## [ 68 87]
## [ 50 52]
## [ 51 56]
## [107 130]
## [105 122]
## [103 161]
## [163 171]
## [ 20 31]
## [ 11 158]
## [ 67 165]
## [ 70 167]
## [ 42 162]
## [113 150]
## [ 6 184]
## [173 200]
## [ 55 90]
## [176 182]
## [ 86 195]
## [124 186]
## [ 83 133]
## [ 5 18]
## [ 13 205]
## [175 177]
## [ 32 33]
## [125 129]
## [104 154]
## [ 73 188]
## [149 204]
## [146 172]
## [121 206]
## [ 36 155]
## [ 76 190]
## [115 187]
## [ 61 71]
## [156 208]
## [ 72 213]
## [117 131]
## [191 212]
## [ 24 202]
## [ 98 159]
## [ 16 224]
## [ 35 210]
## [ 64 79]
## [ 85 196]
## [ 77 185]
## [ 44 183]
## [111 199]
## [180 189]
## [102 218]
## [174 192]
## [181 227]
## [170 225]
## [118 198]
## [ 14 15]
## [178 209]
## [222 229]
## [201 234]
## [114 223]
## [ 60 233]
## [217 247]
## [ 59 241]
## [207 232]
## [197 242]
## [ 62 203]
## [214 250]
## [119 194]
## [100 226]
## [108 219]
## [216 248]
## [211 245]
## [240 261]
## [193 239]
## [109 135]
## [235 255]
## [238 243]
## [236 254]
## [ 22 215]
## [220 244]
## [228 265]
## [257 269]
## [134 249]
## [221 237]
## [231 260]
## [ 41 270]
## [230 266]
## [106 262]
## [253 258]
## [259 274]
## [267 277]
## [264 268]
## [271 275]
## [246 278]
## [251 281]
## [276 283]
## [256 285]
## [273 279]
## [272 280]
## [263 284]
## [252 289]
## [286 291]
## [282 290]
## [287 288]
## [292 293]
## [295 296]
## [294 297]]
## <class 'sklearn.cluster.hierarchical.AgglomerativeClustering'>
http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans
# !/usr/bin/env python
# -*- coding: utf-8 -*-
# __author__ = "abdata"
import time
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
# from sklearn import datasets
# import matplotlib.pyplot as plt
# Load the iris CSV from the blog's data host and keep positional columns
# 2..5. Per the output recorded after this script, these are Sepal.Length,
# Sepal.Width, Petal.Length and Petal.Width (150 rows x 4 columns).
mdata = pd.read_csv('http://data.galaxystatistics.com/blog_data/regression/iris.csv')
X = mdata.iloc[:, 2:6]
print(X.head())
print(X.shape)
print(type(X))

# Fit a 3-cluster k-means model with k-means++ seeding and time the fit.
# `n_init` is pinned to 10 (the historical default): newer scikit-learn
# releases default to 'auto', which runs a single initialisation for
# k-means++ and may land on a different solution than the recorded output.
# `random_state` fixes the seeding so repeated runs agree.
t0 = time.time()
model = KMeans(
    init='k-means++',
    n_clusters=3,
    n_init=10,
    random_state=8,
).fit(X)
t = time.time() - t0
print(t)

# Per-sample cluster labels, the fitted cluster centres, the model's
# inertia, and finally the estimator's type.
print(model.labels_)
print(model.cluster_centers_)
print(model.inertia_)
print(type(model))
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 0 5.1 3.5 1.4 0.2
## 1 4.9 3.0 1.4 0.2
## 2 4.7 3.2 1.3 0.2
## 3 4.6 3.1 1.5 0.2
## 4 5.0 3.6 1.4 0.2
## (150, 4)
## <class 'pandas.core.frame.DataFrame'>
## 0.02309250831604004
## [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 2 2 2 2 0 2 2 2 2
## 2 2 0 0 2 2 2 2 0 2 0 2 0 2 2 0 0 2 2 2 2 2 0 2 2 2 2 0 2 2 2 0 2 2 2 0 2
## 2 0]
## [[ 5.9016129 2.7483871 4.39354839 1.43387097]
## [ 5.006 3.428 1.462 0.246 ]
## [ 6.85 3.07368421 5.74210526 2.07105263]]
## 78.8514414261
## <class 'sklearn.cluster.k_means_.KMeans'>
http://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html#sklearn.cluster.DBSCAN
# !/usr/bin/env python
# -*- coding: utf-8 -*-
# __author__ = "abdata"
import time
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
# from sklearn import datasets
# import matplotlib.pyplot as plt
# Read the iris CSV and take positional columns 2 through 5 as the features.
mdata = pd.read_csv(
    'http://data.galaxystatistics.com/blog_data/regression/iris.csv'
)
X = mdata.iloc[:, 2:6]

# Show a preview of the features, their dimensions and their container type.
for summary in (X.head(), X.shape, type(X)):
    print(summary)

# Build and fit DBSCAN (eps=0.3, min_samples=3), timing construction plus fit.
t0 = time.time()
model = DBSCAN(eps=0.3, min_samples=3)
model.fit(X)
t = time.time() - t0
print(t)

# Dump the fitted attributes in turn: the per-sample labels, the indices of
# the core samples, the `components_` array, and the estimator's type.
for fitted in (
    model.labels_,
    model.core_sample_indices_,
    model.components_,
    type(model),
):
    print(fitted)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 0 5.1 3.5 1.4 0.2
## 1 4.9 3.0 1.4 0.2
## 2 4.7 3.2 1.3 0.2
## 3 4.6 3.1 1.5 0.2
## 4 5.0 3.6 1.4 0.2
## (150, 4)
## <class 'pandas.core.frame.DataFrame'>
## 0.002005338668823242
## [ 0 0 0 0 0 -1 0 0 0 0 0 0 0 0 -1 -1 -1 0 -1 0 -1 0 -1 0 0
## 0 0 0 0 0 0 -1 -1 -1 0 0 0 0 0 0 0 -1 0 0 -1 0 0 0 0 0
## 1 -1 1 4 2 -1 -1 -1 2 -1 -1 -1 -1 3 -1 2 -1 4 -1 4 5 -1 -1 3 2
## 2 -1 -1 3 -1 4 4 4 -1 -1 -1 1 -1 4 4 4 3 4 -1 4 4 4 2 -1 4
## -1 6 -1 7 -1 -1 -1 -1 -1 -1 -1 -1 -1 6 -1 -1 7 -1 -1 -1 8 -1 -1 5 8
## -1 5 5 -1 -1 -1 -1 -1 -1 -1 -1 -1 7 5 -1 8 -1 6 8 8 -1 5 -1 -1 5]
## [ 0 1 2 3 4 6 7 8 9 10 11 12 17 19 21 23 25 26
## 27 28 29 30 34 35 37 38 39 40 42 43 45 46 47 48 49 52
## 58 63 67 69 70 74 75 78 80 81 82 88 89 91 92 94 95 96
## 99 101 103 113 116 120 123 126 127 137 138 140 142]
## [[ 5.1 3.5 1.4 0.2]
## [ 4.9 3. 1.4 0.2]
## [ 4.7 3.2 1.3 0.2]
## [ 4.6 3.1 1.5 0.2]
## [ 5. 3.6 1.4 0.2]
## [ 4.6 3.4 1.4 0.3]
## [ 5. 3.4 1.5 0.2]
## [ 4.4 2.9 1.4 0.2]
## [ 4.9 3.1 1.5 0.1]
## [ 5.4 3.7 1.5 0.2]
## [ 4.8 3.4 1.6 0.2]
## [ 4.8 3. 1.4 0.1]
## [ 5.1 3.5 1.4 0.3]
## [ 5.1 3.8 1.5 0.3]
## [ 5.1 3.7 1.5 0.4]
## [ 5.1 3.3 1.7 0.5]
## [ 5. 3. 1.6 0.2]
## [ 5. 3.4 1.6 0.4]
## [ 5.2 3.5 1.5 0.2]
## [ 5.2 3.4 1.4 0.2]
## [ 4.7 3.2 1.6 0.2]
## [ 4.8 3.1 1.6 0.2]
## [ 4.9 3.1 1.5 0.2]
## [ 5. 3.2 1.2 0.2]
## [ 4.9 3.6 1.4 0.1]
## [ 4.4 3. 1.3 0.2]
## [ 5.1 3.4 1.5 0.2]
## [ 5. 3.5 1.3 0.3]
## [ 4.4 3.2 1.3 0.2]
## [ 5. 3.5 1.6 0.6]
## [ 4.8 3. 1.4 0.3]
## [ 5.1 3.8 1.6 0.2]
## [ 4.6 3.2 1.4 0.2]
## [ 5.3 3.7 1.5 0.2]
## [ 5. 3.3 1.4 0.2]
## [ 6.9 3.1 4.9 1.5]
## [ 6.6 2.9 4.6 1.3]
## [ 6.1 2.9 4.7 1.4]
## [ 5.8 2.7 4.1 1. ]
## [ 5.6 2.5 3.9 1.1]
## [ 5.9 3.2 4.8 1.8]
## [ 6.4 2.9 4.3 1.3]
## [ 6.6 3. 4.4 1.4]
## [ 6. 2.9 4.5 1.5]
## [ 5.5 2.4 3.8 1.1]
## [ 5.5 2.4 3.7 1. ]
## [ 5.8 2.7 3.9 1.2]
## [ 5.6 3. 4.1 1.3]
## [ 5.5 2.5 4. 1.3]
## [ 6.1 3. 4.6 1.4]
## [ 5.8 2.6 4. 1.2]
## [ 5.6 2.7 4.2 1.3]
## [ 5.7 3. 4.2 1.2]
## [ 5.7 2.9 4.2 1.3]
## [ 5.7 2.8 4.1 1.3]
## [ 5.8 2.7 5.1 1.9]
## [ 6.3 2.9 5.6 1.8]
## [ 5.7 2.5 5. 2. ]
## [ 6.5 3. 5.5 1.8]
## [ 6.9 3.2 5.7 2.3]
## [ 6.3 2.7 4.9 1.8]
## [ 6.2 2.8 4.8 1.8]
## [ 6.1 3. 4.9 1.8]
## [ 6.4 3.1 5.5 1.8]
## [ 6. 3. 4.8 1.8]
## [ 6.7 3.1 5.6 2.4]
## [ 5.8 2.7 5.1 1.9]]
## <class 'sklearn.cluster.dbscan_.DBSCAN'>