本文主要介绍在 Python 中实现聚类算法的几个小实例，主要使用 scikit-learn 模块中的相关方法来实现。

http://scikit-learn.org/stable/modules/clustering.html#hierarchical-clustering

层次聚类-AgglomerativeClustering

http://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html#sklearn.cluster.AgglomerativeClustering

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# __author__ = "abdata"

import time
import numpy as np
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
# from sklearn import datasets
# import matplotlib.pyplot as plt

# Load the iris data set from the remote CSV (requires network access).
mdata = pd.read_csv('http://data.galaxystatistics.com/blog_data/regression/iris.csv')

# Feature matrix: the columns at positions 2..5 of the CSV.
X = mdata.iloc[:, 2:6]
print(X.head())
print(X.shape)
print(type(X))

# Time the fit of a 3-cluster agglomerative model with ward linkage.
t0 = time.time()
# - Ward linkage works on Euclidean distances, which is also the default, so
#   the distance keyword (`affinity`, removed in newer scikit-learn releases)
#   is omitted to stay valid on both old and new versions.
# - compute_full_tree must be 'auto' or a real boolean; the string 'false' is
#   rejected by parameter validation in current releases.  Per the
#   scikit-learn docs the option only matters when a connectivity matrix is
#   supplied, which is not the case here.
model = AgglomerativeClustering(
    n_clusters=3,
    linkage='ward',
    compute_full_tree=False,
).fit(X)
t = time.time() - t0

print(t)

print(model.labels_)
print(model.n_leaves_)
# `n_components_` was renamed to `n_connected_components_`; prefer the new
# name and fall back to the old one on releases that predate the rename.
if hasattr(model, 'n_connected_components_'):
    print(model.n_connected_components_)
else:
    print(model.n_components_)
print(model.children_)

print(type(model))
##    Sepal.Length  Sepal.Width  Petal.Length  Petal.Width
## 0           5.1          3.5           1.4          0.2
## 1           4.9          3.0           1.4          0.2
## 2           4.7          3.2           1.3          0.2
## 3           4.6          3.1           1.5          0.2
## 4           5.0          3.6           1.4          0.2
## (150, 4)
## <class 'pandas.core.frame.DataFrame'>
## 0.020556211471557617
## [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 2 2 2 2 0 2 2 2 2
##  2 2 0 0 2 2 2 2 0 2 0 2 0 2 2 0 0 2 2 2 2 2 0 0 2 2 2 0 2 2 2 0 2 2 2 0 2
##  2 0]
## 150
## 1
## [[101 142]
##  [  7  39]
##  [  0  17]
##  [  9  34]
##  [128 132]
##  [ 10  48]
##  [  4  37]
##  [ 19  21]
##  [ 29  30]
##  [ 57  93]
##  [ 80  81]
##  [116 137]
##  [  8  38]
##  [ 27  28]
##  [  3  47]
##  [ 82  92]
##  [ 95  96]
##  [127 138]
##  [  1  45]
##  [ 63  91]
##  [ 65  75]
##  [ 40 152]
##  [123 126]
##  [ 49 151]
##  [112 139]
##  [ 94  99]
##  [ 12 168]
##  [ 88 166]
##  [ 66  84]
##  [ 23  26]
##  [ 53  89]
##  [ 74  97]
##  [ 25 153]
##  [ 46 157]
##  [  2 164]
##  [110 147]
##  [120 143]
##  [136 148]
##  [ 78 169]
##  [ 69 160]
##  [ 54  58]
##  [140 144]
##  [141 145]
##  [ 43 179]
##  [ 68  87]
##  [ 50  52]
##  [ 51  56]
##  [107 130]
##  [105 122]
##  [103 161]
##  [163 171]
##  [ 20  31]
##  [ 11 158]
##  [ 67 165]
##  [ 70 167]
##  [ 42 162]
##  [113 150]
##  [  6 184]
##  [173 200]
##  [ 55  90]
##  [176 182]
##  [ 86 195]
##  [124 186]
##  [ 83 133]
##  [  5  18]
##  [ 13 205]
##  [175 177]
##  [ 32  33]
##  [125 129]
##  [104 154]
##  [ 73 188]
##  [149 204]
##  [146 172]
##  [121 206]
##  [ 36 155]
##  [ 76 190]
##  [115 187]
##  [ 61  71]
##  [156 208]
##  [ 72 213]
##  [117 131]
##  [191 212]
##  [ 24 202]
##  [ 98 159]
##  [ 16 224]
##  [ 35 210]
##  [ 64  79]
##  [ 85 196]
##  [ 77 185]
##  [ 44 183]
##  [111 199]
##  [180 189]
##  [102 218]
##  [174 192]
##  [181 227]
##  [170 225]
##  [118 198]
##  [ 14  15]
##  [178 209]
##  [222 229]
##  [201 234]
##  [114 223]
##  [ 60 233]
##  [217 247]
##  [ 59 241]
##  [207 232]
##  [197 242]
##  [ 62 203]
##  [214 250]
##  [119 194]
##  [100 226]
##  [108 219]
##  [216 248]
##  [211 245]
##  [240 261]
##  [193 239]
##  [109 135]
##  [235 255]
##  [238 243]
##  [236 254]
##  [ 22 215]
##  [220 244]
##  [228 265]
##  [257 269]
##  [134 249]
##  [221 237]
##  [231 260]
##  [ 41 270]
##  [230 266]
##  [106 262]
##  [253 258]
##  [259 274]
##  [267 277]
##  [264 268]
##  [271 275]
##  [246 278]
##  [251 281]
##  [276 283]
##  [256 285]
##  [273 279]
##  [272 280]
##  [263 284]
##  [252 289]
##  [286 291]
##  [282 290]
##  [287 288]
##  [292 293]
##  [295 296]
##  [294 297]]
## <class 'sklearn.cluster.hierarchical.AgglomerativeClustering'>

KMeans聚类-KMeans

http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# __author__ = "abdata"

import time
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
# from sklearn import datasets
# import matplotlib.pyplot as plt

# Load the iris data set from the remote CSV (requires network access).
mdata = pd.read_csv('http://data.galaxystatistics.com/blog_data/regression/iris.csv')

# Feature matrix: the columns at positions 2..5 of the CSV.
X = mdata.iloc[:, 2:6]
print(X.head())
print(X.shape)
print(type(X))

# Time the fit of a 3-cluster k-means model.
t0 = time.time()
# n_init is pinned explicitly: its library default changed between
# scikit-learn releases (10 restarts historically, 'auto' later), so leaving
# it implicit makes the fitted centers/inertia version-dependent even with a
# fixed random_state.
model = KMeans(
    init='k-means++',
    n_clusters=3,
    n_init=10,
    random_state=8,
).fit(X)
t = time.time() - t0

print(t)
print(model.labels_)
print(model.cluster_centers_)
print(model.inertia_)

print(type(model))
##    Sepal.Length  Sepal.Width  Petal.Length  Petal.Width
## 0           5.1          3.5           1.4          0.2
## 1           4.9          3.0           1.4          0.2
## 2           4.7          3.2           1.3          0.2
## 3           4.6          3.1           1.5          0.2
## 4           5.0          3.6           1.4          0.2
## (150, 4)
## <class 'pandas.core.frame.DataFrame'>
## 0.02309250831604004
## [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 2 2 2 2 0 2 2 2 2
##  2 2 0 0 2 2 2 2 0 2 0 2 0 2 2 0 0 2 2 2 2 2 0 2 2 2 2 0 2 2 2 0 2 2 2 0 2
##  2 0]
## [[ 5.9016129   2.7483871   4.39354839  1.43387097]
##  [ 5.006       3.428       1.462       0.246     ]
##  [ 6.85        3.07368421  5.74210526  2.07105263]]
## 78.8514414261
## <class 'sklearn.cluster.k_means_.KMeans'>

密度聚类-DBSCAN

http://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html#sklearn.cluster.DBSCAN

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# __author__ = "abdata"

import time
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
# from sklearn import datasets
# import matplotlib.pyplot as plt

mdata = pd.read_csv('http://data.galaxystatistics.com/blog_data/regression/iris.csv')

# Feature matrix: the columns at positions 2..5 of the loaded CSV.
X = mdata.iloc[:, 2:6]

# Quick look at the input: first rows, dimensions and container type.
for summary in (X.head(), X.shape, type(X)):
    print(summary)

# Time the DBSCAN fit (neighbourhood radius 0.3, at least 3 samples).
t0 = time.time()
model = DBSCAN(eps=0.3, min_samples=3)
model.fit(X)
t = time.time() - t0

# Elapsed seconds, then the fitted attributes in the original print order.
for result in (
    t,
    model.labels_,
    model.core_sample_indices_,
    model.components_,
):
    print(result)

print(type(model))
##    Sepal.Length  Sepal.Width  Petal.Length  Petal.Width
## 0           5.1          3.5           1.4          0.2
## 1           4.9          3.0           1.4          0.2
## 2           4.7          3.2           1.3          0.2
## 3           4.6          3.1           1.5          0.2
## 4           5.0          3.6           1.4          0.2
## (150, 4)
## <class 'pandas.core.frame.DataFrame'>
## 0.002005338668823242
## [ 0  0  0  0  0 -1  0  0  0  0  0  0  0  0 -1 -1 -1  0 -1  0 -1  0 -1  0  0
##   0  0  0  0  0  0 -1 -1 -1  0  0  0  0  0  0  0 -1  0  0 -1  0  0  0  0  0
##   1 -1  1  4  2 -1 -1 -1  2 -1 -1 -1 -1  3 -1  2 -1  4 -1  4  5 -1 -1  3  2
##   2 -1 -1  3 -1  4  4  4 -1 -1 -1  1 -1  4  4  4  3  4 -1  4  4  4  2 -1  4
##  -1  6 -1  7 -1 -1 -1 -1 -1 -1 -1 -1 -1  6 -1 -1  7 -1 -1 -1  8 -1 -1  5  8
##  -1  5  5 -1 -1 -1 -1 -1 -1 -1 -1 -1  7  5 -1  8 -1  6  8  8 -1  5 -1 -1  5]
## [  0   1   2   3   4   6   7   8   9  10  11  12  17  19  21  23  25  26
##   27  28  29  30  34  35  37  38  39  40  42  43  45  46  47  48  49  52
##   58  63  67  69  70  74  75  78  80  81  82  88  89  91  92  94  95  96
##   99 101 103 113 116 120 123 126 127 137 138 140 142]
## [[ 5.1  3.5  1.4  0.2]
##  [ 4.9  3.   1.4  0.2]
##  [ 4.7  3.2  1.3  0.2]
##  [ 4.6  3.1  1.5  0.2]
##  [ 5.   3.6  1.4  0.2]
##  [ 4.6  3.4  1.4  0.3]
##  [ 5.   3.4  1.5  0.2]
##  [ 4.4  2.9  1.4  0.2]
##  [ 4.9  3.1  1.5  0.1]
##  [ 5.4  3.7  1.5  0.2]
##  [ 4.8  3.4  1.6  0.2]
##  [ 4.8  3.   1.4  0.1]
##  [ 5.1  3.5  1.4  0.3]
##  [ 5.1  3.8  1.5  0.3]
##  [ 5.1  3.7  1.5  0.4]
##  [ 5.1  3.3  1.7  0.5]
##  [ 5.   3.   1.6  0.2]
##  [ 5.   3.4  1.6  0.4]
##  [ 5.2  3.5  1.5  0.2]
##  [ 5.2  3.4  1.4  0.2]
##  [ 4.7  3.2  1.6  0.2]
##  [ 4.8  3.1  1.6  0.2]
##  [ 4.9  3.1  1.5  0.2]
##  [ 5.   3.2  1.2  0.2]
##  [ 4.9  3.6  1.4  0.1]
##  [ 4.4  3.   1.3  0.2]
##  [ 5.1  3.4  1.5  0.2]
##  [ 5.   3.5  1.3  0.3]
##  [ 4.4  3.2  1.3  0.2]
##  [ 5.   3.5  1.6  0.6]
##  [ 4.8  3.   1.4  0.3]
##  [ 5.1  3.8  1.6  0.2]
##  [ 4.6  3.2  1.4  0.2]
##  [ 5.3  3.7  1.5  0.2]
##  [ 5.   3.3  1.4  0.2]
##  [ 6.9  3.1  4.9  1.5]
##  [ 6.6  2.9  4.6  1.3]
##  [ 6.1  2.9  4.7  1.4]
##  [ 5.8  2.7  4.1  1. ]
##  [ 5.6  2.5  3.9  1.1]
##  [ 5.9  3.2  4.8  1.8]
##  [ 6.4  2.9  4.3  1.3]
##  [ 6.6  3.   4.4  1.4]
##  [ 6.   2.9  4.5  1.5]
##  [ 5.5  2.4  3.8  1.1]
##  [ 5.5  2.4  3.7  1. ]
##  [ 5.8  2.7  3.9  1.2]
##  [ 5.6  3.   4.1  1.3]
##  [ 5.5  2.5  4.   1.3]
##  [ 6.1  3.   4.6  1.4]
##  [ 5.8  2.6  4.   1.2]
##  [ 5.6  2.7  4.2  1.3]
##  [ 5.7  3.   4.2  1.2]
##  [ 5.7  2.9  4.2  1.3]
##  [ 5.7  2.8  4.1  1.3]
##  [ 5.8  2.7  5.1  1.9]
##  [ 6.3  2.9  5.6  1.8]
##  [ 5.7  2.5  5.   2. ]
##  [ 6.5  3.   5.5  1.8]
##  [ 6.9  3.2  5.7  2.3]
##  [ 6.3  2.7  4.9  1.8]
##  [ 6.2  2.8  4.8  1.8]
##  [ 6.1  3.   4.9  1.8]
##  [ 6.4  3.1  5.5  1.8]
##  [ 6.   3.   4.8  1.8]
##  [ 6.7  3.1  5.6  2.4]
##  [ 5.8  2.7  5.1  1.9]]
## <class 'sklearn.cluster.dbscan_.DBSCAN'>