SAS Programming for Data Mining: January 2010

[BUG FOUND, IN PROCESS OF FIXING IT]

Gap statistic is a method used to estimate the most possible number of clusters in a partition clustering, noticeablly k-means clustering. This measurement was originated by Trevor Hastie, Robert Tibshirani, and Guenther Walther, all from Standford University.

The final estimate of cluster number is
min_K Gap(K)>=Gap(K+1)-s(K+1)

Code below implements uniform sampling approach (method (a) in their paper, samplemethod=1)and SVD sampling approach (method (b), samplemethod=2).

The simulated data has 5 clusters as seen below, while the Gap Statistic correctly identifies this number of clusters as:
\hat_k = 4, Gap=1.37285, Gap_(K+1)-s_(K+1)=1.6494
\hat_k = 5, Gap=1.65057, Gap_(K+1)-s_(K+1)=1.6485, Gap_K>[Gap-s]_(K+1)
\hat_k = 6, Gap=1.64965, Gap_(K+1)-s_(K+1)=1.6483


/* Gap Statistics of Hastie et al. */

data test;
     length ID 5;
     array _X{50} X1-X50;
  do ID=1 to 4000;
     beta=mod(ID, 5)+1;
  do _j=1 to 50;
     _X[_j]=rannor(6898776)+beta*3;
  end;
  output;
  keep ID X:;
  end;
run;


%macro GAPstat(dsn, ID_var, dsn_GAP, max_K, max_rep, samplemethod=1, seed=147852369);
%local dsn  B K dsid nobs ID dsn_out  features features2 num_features;

%let B=&max_rep;
%let dsid=%sysfunc(open(&dsn));
%let nobs=%sysfunc(attrn(&dsid, nobs));
%let dsid=%sysfunc(close(&dsid));
%put &nobs;

%let ID=&ID_var;
%let dsn_out=_clus_out;
%let ref_dist=_ref_dist;


proc contents data=&dsn out=allvars(keep=name varnum) noprint; run;
proc sort data=allvars; by varnum; run;
proc sql  noprint;
     select name into :features separated by ' '
  from   allvars
  where  name^="&ID"
  ;
     select count(distinct name) into :num_features 
  from   allvars
  where  name^="&ID"
  ;
quit;


%if &samplemethod eq 1 %then %do;
proc means data=&dsn noprint;
     var &features;
  output out=_lim(where=(_STAT_ in ('MIN', 'MAX')));
run;
%end;
%else %if &samplemethod eq 2 %then %do;
proc princomp data=&dsn   noint cov noprint
                     outstat=_V(where=(_TYPE_='USCORE'));
    var &features;
run;
data _V_score/view=_V_score;
       set _V;
    _TYPE_='PARMS';
    
run;
proc score data=&dsn  score=_V_score type=parms  out=_Z;
       var &features;
run;
proc means data=_Z  noprint;
       var &features;
    output   out=_lim(where=(_STAT_ in ('MAX', 'MIN')));
run;
%end;
%else %do;
         %put Only value=1 (uniformly sampling) and 2 (SVD sampling) allowed.;
         %goto exit;
%end;

data &ref_dist;
     array _stat{2, &num_features} _temporary_;
  array _F{&num_features}  &features;
  
     do until (eof);
     set _lim end=eof;
     if _STAT_='MIN' then _row=1;
     else _row=2;
     do _k=1 to &num_features; 
              if _row=1 then _stat[_row, _k]=_F[_k];
     else _stat[_row, _k]=_F[_k]-_stat[_row-1, _k];
     end;
 end;
  
     
  do _B=1 to &B;
     do &ID=1 to &nobs;
     do _k=1 to &num_features;
        _F[_k]=ranuni(&seed)*(_stat[2, _k]) + _stat[1, _k];
     end;    
     keep _B &ID &features;
     output;
  end;
 end;
run;

%if &samplemethod eq 2 %then %do;
proc transpose data=_V  out=_Vt;  run;
proc sql noprint;
       select  name into :features2 separated by ' ", "'
    from   allvars
    where  upcase(name)^=("&ID")
    ;
quit;
data _Vt_score/view=_Vt_Score;     
    set _Vt; 
    set allvars(where=(name in ("&features2")));
       retain  _TYPE_ "PARMS"; 
    _LABEL_=_NAME_;
    _NAME_=name;
    drop name;
run;
proc score data=&ref_dist   score=_Vt_score   type=parms
                out=&ref_dist.(keep=_B  &ID &features);
    var &features;
run;
%end;

%do K=1 %to &max_K;
    proc fastclus data=&dsn  maxclusters=&K  
              out=&dsn_out.(keep=&ID Distance CLUSTER) 
              noprint;
         var &features;
    run;

    proc fastclus data=&ref_dist maxclusters=&K  
              out=&dsn_out._&K.(keep=&ID _B Distance CLUSTER)  
              noprint;
         by _B;
         var &features;
    run;

 proc sort data=&dsn_out; by CLUSTER; run;
 proc sort data=&dsn_out._&K; by _B CLUSTER; run;

    proc means data=&dsn_out  sum noprint;          
      class CLUSTER;
         var Distance;
         output out=_dist_  sum(Distance)=Distortion;
    run;
    proc means data=&dsn_out._&K  sum noprint;
         by _B  CLUSTER;
         var Distance;
         output out=_dist_&K  sum(Distance)=Distortion;
    run;
    data _dist_; 
      set _dist_;
   _FREQ_=_FREQ_/&nobs;
 run;
 data _dist_&K;
      set _dist_&K;
   _FREQ_=_FREQ_/&nobs;
 run;
 proc means data=_dist_ noprint  sum;
      var Distortion;
   weight _FREQ_;
   output out=_dist_  sum(distortion)=distortion;
 run;
 proc means data=_dist_&K noprint  sum;
      by _B;
   var Distortion;
   weight _FREQ_;
   output out=_dist_&K  sum(distortion)=distortion;
 run;

    data _dist_&K(drop=mean _B:) _logmean_&K.(keep=mean);
         array _D{&B} _B1-_B&B;
         set _dist_&K end=eof;
         logD=log(Distortion)+log(&nobs);
         _D[_n_]=logD;
         output _dist_&k;
         if eof then do;     
            mean=mean(of _D[*]);
            output  _logmean_&K;  
         end;
    run;

    proc means data=_dist_&K  noprint;
         var logD;
         output out=_s_&K.(where=(_STAT_='STD'));
    run;

    data Gap_&K;
        set _dist_; set _logmean_&K ; set _s_&K(keep=logD);
  Wk=log(distortion)+log(&nobs);
        GAP=mean-Wk;
        s=logD*sqrt(1+1/&B);
        keep mean  Wk  GAP s;
    run;
%end;

data &dsn_GAP;
     set %do K=1 %to &max_K;
         Gap_&K
   %end;;
run;
%exit:
%mend;

%let dsn=test;
%let ID=ID;
%let dsn_GAP=GAP;
%let max_K=10;
%let max_rep=100;
%GAPStat(&dsn, &ID, &dsn_GAP, &max_K, &max_rep, samplemethod=2);

A reference paper can be found at:
R. Tibshirani, G. Walther, and T. Hastie. 2001.
Estimating the number of clusters in a dataset via the Gap statistic.
Journal of the Royal Statistics Society (Series B), pp. 411--423.

Or the book by R. Tibshirani, G. Walther, and T. Hastie:
The Elements of Statistical Learning

The Elements of Statistical Learning: Data Mining, Inference, and Prediction, Second Edition (Springer Series in Statistics)

SAS Programming for Data Mining

Page Title

Thursday, January 21, 2010

Implementing Gap statistic for clustering number estimation

Pageviews last month