with(LinearAlgebra): with(combinat): #Written by JOSEPH IRGON, Rutgers University as a Final Project for Math 640, Spring 2006 #(taught by Dr. Z. (Doron Zeilberger) # #Kmeans(Data,Centers,k,maxi,mind) #Inputs Data in a matrix containting variables in columns, and records in rows #maxi is the maxinum number iterations #mind is the mininum converging distance #Centers is the initial centeroids #k is number of clusters you want #Outputs 3 things: [1] Vector of Centers of each cluster [2] list of clusters containing the #index of the records that belong to the cluster [3] the vector of the sum of squared #distances of each cluster kMeans:=proc(Data,Centers,k,maxi,mind) local i,j,l,m,n,ctr1,rec,champ,iCenters,nCenters,tmp,Clusters,Clusters1,dist,dist1,distances: #n is number of records #m is number of variables (or dimension) n:=RowDimension(Data): m:=ColumnDimension(Data): iCenters:=Centers: dist1:=infinity: Clusters1:=[[]$k]: for l to maxi do distances:=[0$k]: Clusters:=[[]$k]: nCenters:=Matrix(k,m): # For each record in Data, decide which centriod is the closest and put it in that cluster for j to n do rec:=infinity: for i from 1 to k do ctr1:=iCenters[i,1..m]: tmp:=VectorNorm(ctr1-Data[j,1..m],Euclidean): if evalf(tmp)(abs(dist1-dist)/dist) then # print(`Converged by min dist`): RETURN(nCenters,Clusters,distances): fi: iCenters:=nCenters: Clusters1:=Clusters: dist1:=dist: od: end: #Initialize Centroids in a random fasion, outputs a vector of random centrs InitCentroids:=proc(Data,k) Data[randperm(RowDimension(Data))[1..k],1..ColumnDimension(Data)]: end: #ISODATA algorithm as found on the internet, an upgrade to the kmeans algorithm developed in #1999. Merges clusters that are closer than maxd (max distance) and splits clusters that #have variablitiy greater than var #outputs the centers, clusters, distances of each cluster and total distance ISODATA:=proc(Data,k,var,maxd,maxi,mind) local i,m,k1,tmpCenters,tmpC,Clusters,Clustering,nCenters,tCenters,indx,dist,totdist,iter,i1: global Cluster,Centers: #var is the intracluster variablity #maxd is the intercluster maximum distance for merging m:=ColumnDimension(Data): k1:=k: Centers:=InitCentroids(Data,k): Clustering:=kMeans(Data,Centers,k,maxi,mind): Centers:=Clustering[1]: dist:=Clustering[3]: Cluster:=Clustering[2]: iter:=1: #check for splitting opportunities while max(op(dist))>var and itervar then tCenters:=InitCentroids(Data[Cluster[i],1..m],2): tmpC:=kMeans(Data[Cluster[i],1..m],tCenters,2,maxi,mind): tmpCenters:=tmpC[1]: nCenters[i1,1..m]:=tmpCenters[1,1..m]: nCenters[i1+1,1..m]:=tmpCenters[2,1..m]: i1:=i1+2: else nCenters[i1,1..m]:=Centers[i,1..m]: i1:=i1+1: fi: od: k1:=i1-1: Clustering:=kMeans(Data,nCenters,k1,maxi,mind): Centers:=Clustering[1]: dist:=Clustering[3]: Cluster:=Clustering[2]: iter:=iter+1: od: #check for merging opportunities while MergeCenters(maxd) do k1:=k1-1: Clustering:=kMeans(Data,Centers,k1,maxi,mind): Centers:=Clustering[1]: Cluster:=Clustering[2]: od: totdist:=`+`(op(Clustering[3])): Clustering,totdist: end: MergeCenters:=proc(maxd) local i, j,k,m,rec,champ,dd,ni,nj: global Centers,Cluster: k:=nops(Cluster): m:=ColumnDimension(Centers): rec:=infinity: for i to k do for j to i-1 do dd:=VectorNorm(Centers[i,1..m]-Centers[j,1..m],Euclidean): if dd: od: Data:=DeleteRow(Data,1); end: