Thursday, January 21, 2010

Implementing Gap statistic for clustering number estimation

[BUG FOUND, IN PROCESS OF FIXING IT]

Gap statistic is a method used to estimate the most possible number of clusters in a partition clustering, noticeablly k-means clustering. This measurement was originated by Trevor Hastie, Robert Tibshirani, and Guenther Walther, all from Standford University.


The final estimate of cluster number is
min_K Gap(K)>=Gap(K+1)-s(K+1)

Code below implements uniform sampling approach (method (a) in their paper, samplemethod=1)and SVD sampling approach (method (b), samplemethod=2).

The simulated data has 5 clusters as seen below, while the Gap Statistic correctly identifies this number of clusters as:
\hat_k = 4, Gap=1.37285, Gap_(K+1)-s_(K+1)=1.6494
\hat_k = 5, Gap=1.65057, Gap_(K+1)-s_(K+1)=1.6485, Gap_K>[Gap-s]_(K+1)
\hat_k = 6, Gap=1.64965, Gap_(K+1)-s_(K+1)=1.6483



/* Gap Statistics of Hastie et al. */

data test;
     length ID 5;
     array _X{50} X1-X50;
  do ID=1 to 4000;
     beta=mod(ID, 5)+1;
  do _j=1 to 50;
     _X[_j]=rannor(6898776)+beta*3;
  end;
  output;
  keep ID X:;
  end;
run;


%macro GAPstat(dsn, ID_var, dsn_GAP, max_K, max_rep, samplemethod=1, seed=147852369);
%local dsn  B K dsid nobs ID dsn_out  features features2 num_features;

%let B=&max_rep;
%let dsid=%sysfunc(open(&dsn));
%let nobs=%sysfunc(attrn(&dsid, nobs));
%let dsid=%sysfunc(close(&dsid));
%put &nobs;

%let ID=&ID_var;
%let dsn_out=_clus_out;
%let ref_dist=_ref_dist;


proc contents data=&dsn out=allvars(keep=name varnum) noprint; run;
proc sort data=allvars; by varnum; run;
proc sql  noprint;
     select name into :features separated by ' '
  from   allvars
  where  name^="&ID"
  ;
     select count(distinct name) into :num_features 
  from   allvars
  where  name^="&ID"
  ;
quit;


%if &samplemethod eq 1 %then %do;
proc means data=&dsn noprint;
     var &features;
  output out=_lim(where=(_STAT_ in ('MIN', 'MAX')));
run;
%end;
%else %if &samplemethod eq 2 %then %do;
proc princomp data=&dsn   noint cov noprint
                     outstat=_V(where=(_TYPE_='USCORE'));
    var &features;
run;
data _V_score/view=_V_score;
       set _V;
    _TYPE_='PARMS';
    
run;
proc score data=&dsn  score=_V_score type=parms  out=_Z;
       var &features;
run;
proc means data=_Z  noprint;
       var &features;
    output   out=_lim(where=(_STAT_ in ('MAX', 'MIN')));
run;
%end;
%else %do;
         %put Only value=1 (uniformly sampling) and 2 (SVD sampling) allowed.;
         %goto exit;
%end;

data &ref_dist;
     array _stat{2, &num_features} _temporary_;
  array _F{&num_features}  &features;
  
     do until (eof);
     set _lim end=eof;
     if _STAT_='MIN' then _row=1;
     else _row=2;
     do _k=1 to &num_features; 
              if _row=1 then _stat[_row, _k]=_F[_k];
     else _stat[_row, _k]=_F[_k]-_stat[_row-1, _k];
     end;
 end;
  
     
  do _B=1 to &B;
     do &ID=1 to &nobs;
     do _k=1 to &num_features;
        _F[_k]=ranuni(&seed)*(_stat[2, _k]) + _stat[1, _k];
     end;    
     keep _B &ID &features;
     output;
  end;
 end;
run;

%if &samplemethod eq 2 %then %do;
proc transpose data=_V  out=_Vt;  run;
proc sql noprint;
       select  name into :features2 separated by ' ", "'
    from   allvars
    where  upcase(name)^=("&ID")
    ;
quit;
data _Vt_score/view=_Vt_Score;     
    set _Vt; 
    set allvars(where=(name in ("&features2")));
       retain  _TYPE_ "PARMS"; 
    _LABEL_=_NAME_;
    _NAME_=name;
    drop name;
run;
proc score data=&ref_dist   score=_Vt_score   type=parms
                out=&ref_dist.(keep=_B  &ID &features);
    var &features;
run;
%end;

%do K=1 %to &max_K;
    proc fastclus data=&dsn  maxclusters=&K  
              out=&dsn_out.(keep=&ID Distance CLUSTER) 
              noprint;
         var &features;
    run;

    proc fastclus data=&ref_dist maxclusters=&K  
              out=&dsn_out._&K.(keep=&ID _B Distance CLUSTER)  
              noprint;
         by _B;
         var &features;
    run;

 proc sort data=&dsn_out; by CLUSTER; run;
 proc sort data=&dsn_out._&K; by _B CLUSTER; run;

    proc means data=&dsn_out  sum noprint;          
      class CLUSTER;
         var Distance;
         output out=_dist_  sum(Distance)=Distortion;
    run;
    proc means data=&dsn_out._&K  sum noprint;
         by _B  CLUSTER;
         var Distance;
         output out=_dist_&K  sum(Distance)=Distortion;
    run;
    data _dist_; 
      set _dist_;
   _FREQ_=_FREQ_/&nobs;
 run;
 data _dist_&K;
      set _dist_&K;
   _FREQ_=_FREQ_/&nobs;
 run;
 proc means data=_dist_ noprint  sum;
      var Distortion;
   weight _FREQ_;
   output out=_dist_  sum(distortion)=distortion;
 run;
 proc means data=_dist_&K noprint  sum;
      by _B;
   var Distortion;
   weight _FREQ_;
   output out=_dist_&K  sum(distortion)=distortion;
 run;

    data _dist_&K(drop=mean _B:) _logmean_&K.(keep=mean);
         array _D{&B} _B1-_B&B;
         set _dist_&K end=eof;
         logD=log(Distortion)+log(&nobs);
         _D[_n_]=logD;
         output _dist_&k;
         if eof then do;     
            mean=mean(of _D[*]);
            output  _logmean_&K;  
         end;
    run;

    proc means data=_dist_&K  noprint;
         var logD;
         output out=_s_&K.(where=(_STAT_='STD'));
    run;

    data Gap_&K;
        set _dist_; set _logmean_&K ; set _s_&K(keep=logD);
  Wk=log(distortion)+log(&nobs);
        GAP=mean-Wk;
        s=logD*sqrt(1+1/&B);
        keep mean  Wk  GAP s;
    run;
%end;

data &dsn_GAP;
     set %do K=1 %to &max_K;
         Gap_&K
   %end;;
run;
%exit:
%mend;

%let dsn=test;
%let ID=ID;
%let dsn_GAP=GAP;
%let max_K=10;
%let max_rep=100;
%GAPStat(&dsn, &ID, &dsn_GAP, &max_K, &max_rep, samplemethod=2);


A reference paper can be found at:
R. Tibshirani, G. Walther, and T. Hastie. 2001.
Estimating the number of clusters in a dataset via the Gap statistic.
Journal of the Royal Statistics Society (Series B), pp. 411--423.

Or the book by R. Tibshirani, G. Walther, and T. Hastie:
The Elements of Statistical Learning

The Elements of Statistical Learning: Data Mining, Inference, and Prediction, Second Edition (Springer Series in Statistics)