Gap statistic is a method used to estimate the most possible number of clusters in a partition clustering, noticeablly k-means clustering. This measurement was originated by Trevor Hastie, Robert Tibshirani, and Guenther Walther, all from Standford University.
The final estimate of cluster number is
min_K Gap(K)>=Gap(K+1)-s(K+1)
Code below implements uniform sampling approach (method (a) in their paper, samplemethod=1)and SVD sampling approach (method (b), samplemethod=2).
The simulated data has 5 clusters as seen below, while the Gap Statistic correctly identifies this number of clusters as:
\hat_k = 4, Gap=1.37285, Gap_(K+1)-s_(K+1)=1.6494
\hat_k = 5, Gap=1.65057, Gap_(K+1)-s_(K+1)=1.6485, Gap_K>[Gap-s]_(K+1)
\hat_k = 6, Gap=1.64965, Gap_(K+1)-s_(K+1)=1.6483
/* Gap Statistics of Hastie et al. */
data test;
length ID 5;
array _X{50} X1-X50;
do ID=1 to 4000;
beta=mod(ID, 5)+1;
do _j=1 to 50;
_X[_j]=rannor(6898776)+beta*3;
end;
output;
keep ID X:;
end;
run;
%macro GAPstat(dsn, ID_var, dsn_GAP, max_K, max_rep, samplemethod=1, seed=147852369);
%local dsn B K dsid nobs ID dsn_out features features2 num_features;
%let B=&max_rep;
%let dsid=%sysfunc(open(&dsn));
%let nobs=%sysfunc(attrn(&dsid, nobs));
%let dsid=%sysfunc(close(&dsid));
%put &nobs;
%let ID=&ID_var;
%let dsn_out=_clus_out;
%let ref_dist=_ref_dist;
proc contents data=&dsn out=allvars(keep=name varnum) noprint; run;
proc sort data=allvars; by varnum; run;
proc sql noprint;
select name into :features separated by ' '
from allvars
where name^="&ID"
;
select count(distinct name) into :num_features
from allvars
where name^="&ID"
;
quit;
%if &samplemethod eq 1 %then %do;
proc means data=&dsn noprint;
var &features;
output out=_lim(where=(_STAT_ in ('MIN', 'MAX')));
run;
%end;
%else %if &samplemethod eq 2 %then %do;
proc princomp data=&dsn noint cov noprint
outstat=_V(where=(_TYPE_='USCORE'));
var &features;
run;
data _V_score/view=_V_score;
set _V;
_TYPE_='PARMS';
run;
proc score data=&dsn score=_V_score type=parms out=_Z;
var &features;
run;
proc means data=_Z noprint;
var &features;
output out=_lim(where=(_STAT_ in ('MAX', 'MIN')));
run;
%end;
%else %do;
%put Only value=1 (uniformly sampling) and 2 (SVD sampling) allowed.;
%goto exit;
%end;
data &ref_dist;
array _stat{2, &num_features} _temporary_;
array _F{&num_features} &features;
do until (eof);
set _lim end=eof;
if _STAT_='MIN' then _row=1;
else _row=2;
do _k=1 to &num_features;
if _row=1 then _stat[_row, _k]=_F[_k];
else _stat[_row, _k]=_F[_k]-_stat[_row-1, _k];
end;
end;
do _B=1 to &B;
do &ID=1 to &nobs;
do _k=1 to &num_features;
_F[_k]=ranuni(&seed)*(_stat[2, _k]) + _stat[1, _k];
end;
keep _B &ID &features;
output;
end;
end;
run;
%if &samplemethod eq 2 %then %do;
proc transpose data=_V out=_Vt; run;
proc sql noprint;
select name into :features2 separated by ' ", "'
from allvars
where upcase(name)^=("&ID")
;
quit;
data _Vt_score/view=_Vt_Score;
set _Vt;
set allvars(where=(name in ("&features2")));
retain _TYPE_ "PARMS";
_LABEL_=_NAME_;
_NAME_=name;
drop name;
run;
proc score data=&ref_dist score=_Vt_score type=parms
out=&ref_dist.(keep=_B &ID &features);
var &features;
run;
%end;
%do K=1 %to &max_K;
proc fastclus data=&dsn maxclusters=&K
out=&dsn_out.(keep=&ID Distance CLUSTER)
noprint;
var &features;
run;
proc fastclus data=&ref_dist maxclusters=&K
out=&dsn_out._&K.(keep=&ID _B Distance CLUSTER)
noprint;
by _B;
var &features;
run;
proc sort data=&dsn_out; by CLUSTER; run;
proc sort data=&dsn_out._&K; by _B CLUSTER; run;
proc means data=&dsn_out sum noprint;
class CLUSTER;
var Distance;
output out=_dist_ sum(Distance)=Distortion;
run;
proc means data=&dsn_out._&K sum noprint;
by _B CLUSTER;
var Distance;
output out=_dist_&K sum(Distance)=Distortion;
run;
data _dist_;
set _dist_;
_FREQ_=_FREQ_/&nobs;
run;
data _dist_&K;
set _dist_&K;
_FREQ_=_FREQ_/&nobs;
run;
proc means data=_dist_ noprint sum;
var Distortion;
weight _FREQ_;
output out=_dist_ sum(distortion)=distortion;
run;
proc means data=_dist_&K noprint sum;
by _B;
var Distortion;
weight _FREQ_;
output out=_dist_&K sum(distortion)=distortion;
run;
data _dist_&K(drop=mean _B:) _logmean_&K.(keep=mean);
array _D{&B} _B1-_B&B;
set _dist_&K end=eof;
logD=log(Distortion)+log(&nobs);
_D[_n_]=logD;
output _dist_&k;
if eof then do;
mean=mean(of _D[*]);
output _logmean_&K;
end;
run;
proc means data=_dist_&K noprint;
var logD;
output out=_s_&K.(where=(_STAT_='STD'));
run;
data Gap_&K;
set _dist_; set _logmean_&K ; set _s_&K(keep=logD);
Wk=log(distortion)+log(&nobs);
GAP=mean-Wk;
s=logD*sqrt(1+1/&B);
keep mean Wk GAP s;
run;
%end;
data &dsn_GAP;
set %do K=1 %to &max_K;
Gap_&K
%end;;
run;
%exit:
%mend;
%let dsn=test;
%let ID=ID;
%let dsn_GAP=GAP;
%let max_K=10;
%let max_rep=100;
%GAPStat(&dsn, &ID, &dsn_GAP, &max_K, &max_rep, samplemethod=2);
A reference paper can be found at:
R. Tibshirani, G. Walther, and T. Hastie. 2001.
Estimating the number of clusters in a dataset via the Gap statistic.
Journal of the Royal Statistics Society (Series B), pp. 411--423.
Or the book by R. Tibshirani, G. Walther, and T. Hastie:
The Elements of Statistical Learning
4 comments:
What is the bug in this code?
Someone sent me an email off line stating that there was a bug for method 2. Haven't had time to investigate it recently, but will keep readers posted.
Have there been any further refinements to this macro? It could be very useful. I'm unable to replicate similar results with method=1. Thanks
Haven't got time to finish the bug correction. But I may have some time in Nov to get it corrected.
Post a Comment