Wednesday, February 17, 2010

An efficient macro for stumps - two-terminal-node trees

In this post, I present an improved version of the single-partition-split SAS macro from Chapter 2 of "Pharmaceutical Statistics Using SAS: A Practical Guide" by Alex Dmitrienko, Christy Chuang-Stein, and Ralph B. D'Agostino.

The single partition split algorithm is a simplified version of a stump, a weak classifier commonly used as the base learner in boosting algorithms. This classifier splits the space of an independent variable into two subspaces so that each subspace has higher purity of the response classes, say 1 and 0. In the examples below, the Gini index is used to measure purity/impurity.
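Concretely, for a candidate cutoff $c$ on a predictor $x$, the algorithm evaluates the weighted Gini impurity of the two resulting subspaces,

$$G(c) = 2\left[\frac{n_L}{n}\,p_L(1-p_L) + \frac{n_R}{n}\,p_R(1-p_R)\right],$$

where $n_L$ and $n_R$ count the observations with $x \le c$ and $x > c$, and $p_L$ and $p_R$ are the proportions of $y=1$ on each side; the cutoff minimizing $G(c)$ is selected. This is exactly the quantity the macros below compute in a single pass over the sorted data.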

The SAS example macro %SPLIT in the book (found @ Here) is for illustration purposes only; it is so inefficient that it is practically unusable by industrial standards.

I modified the macro to make it usable in real business applications, where millions of observations and hundreds of variables are commonplace.

Improved Code:



%macro Dsplit(dsn, p);
/************************************************/
/* dsn: Name of the input SAS data set. It must */
/*      contain a 0/1 response Y, a weight W,   */
/*      and independent variables named         */
/*      X1, X2, ..., Xp, all continuous numeric */
/*   p: Number of independent variables         */
/************************************************/
options nonotes;
%do i=1 %to &p;
proc sort data=&dsn.(keep=y w x&i)  out=work.sort; by x&i;
run;

proc means data=work.sort noprint;
     var y;
     weight w;
     output out=_ysum  n(y)=ntotal  sum(y)=ysum;
run;
data _null_;
     set _ysum;
     call symput('ntotal', ntotal);
     call symput('ysum', ysum); 
run;
data y_pred;
     set work.sort  end=eof;
     array _p[&ntotal, 2] _temporary_;  /* P(y=1) left/right of each cut    */
     array _g[&ntotal, 2] _temporary_;  /* Gini index and candidate cutoff  */
     array _x[&ntotal]    _temporary_;
     array _yv[&ntotal]   _temporary_;  /* recall y and w for the scoring   */
     array _wv[&ntotal]   _temporary_;  /* loop at end-of-file              */
     retain _y1  _oldx 0;
     if _n_=1 then _oldx=x&i;
     _x[_n_]=x&i;  _yv[_n_]=y;  _wv[_n_]=w;
     if ^eof then do;
        _y1+y;
        _p[_n_, 1]=_y1/_n_;  _p[_n_, 2]=(&ysum-_y1)/(&ntotal-_n_);
        ppn1=_n_/&ntotal;  ppn2=1-ppn1;
        _g[_n_, 1]=2*(ppn1*(1-_p[_n_, 1])*_p[_n_, 1] + ppn2*(1-_p[_n_, 2])*_p[_n_, 2]);
        if _n_>1 then _g[_n_-1, 2]=(_oldx+x&i)/2;
        _oldx=x&i;
     end;
     else do;
        _g[_n_-1, 2]=(_oldx+x&i)/2;
        ginimin=2;                      /* any Gini index is below this bound */
        do i=1 to &ntotal-1;
           gini=_g[i, 1];  x&i=_g[i, 2];
           keep gini x&i;
           output;
           if gini lt ginimin then do;
              ginimin=gini;  xmin=x&i;
              p1_LH=_p[i, 1];  p0_LH=1-p1_LH;
              p1_RH=_p[i, 2];  p0_RH=1-p1_RH;
              c_L=(p1_LH>0.5);  c_R=(p1_RH>0.5);
           end;
        end;
        do i=1 to &ntotal;              /* score the training data at the best cut */
           if _x[i]<=xmin then y_pred=c_L;
           else y_pred=c_R;
           y=_yv[i];  w=_wv[i];
           keep y_pred y w;
           output;
        end;
        call symput('ginimin', ginimin);
        call symput('xmin', xmin);
     end;
run;
data _giniout&i;
     length varname $ 8;
     varname="x&i";
     cutoff=&xmin;
     gini=&ginimin;
run;
%end;
data outsplit;
     set %do i=1 %to &p;
            _giniout&i
         %end;;
run;
proc datasets library=work nolist;
     delete _giniout:;
quit;
options notes;
%mend;
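A quick usage sketch (the simulated TRAIN data set below is just my illustration of the layout %Dsplit expects, not part of the original example):

/* Hypothetical usage sketch: simulate data with a binary response Y,       */
/* a weight W, and continuous predictors X1-X3, then call the macro.        */
data train;
   call streaminit(2010);
   do id=1 to 10000;
      x1=rand('normal');  x2=rand('normal');  x3=rand('normal');
      y=(x3 + 0.5*rand('normal') > 0);   /* only X3 carries signal */
      w=1;                               /* unweighted */
      output;
   end;
run;

%Dsplit(train, 3);

proc print data=outsplit;   /* best cutoff and Gini index per variable */
run;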

/* Weak classifiers for boosting algorithms */
%macro gini(_y0wsum, _y1wsum, i, nobs);
/* One pass over the sorted data computes the weighted Gini index of every   */
/* candidate cut. The weights W are assumed to be normalized to sum to 1,    */
/* which is why (1-_w) is the total weight to the right of a cut.            */
data _giniout&i.(keep=varname  mingini  cut_val  p0_LH  p1_LH  c_L  p0_RH  p1_RH  c_R);
     length varname $ 8;
     set sorted  end=eof;
     retain _y0w  _y1w  _w  _ginik  0;
     retain p0_LH  p1_LH  p0_RH  p1_RH  c_L  c_R  0;
     array _mingini{4}  _temporary_;
     if _n_=1 then do;
        _y0w = (y^=1)*w;  _y1w = (y=1)*w;  _w = w;
        _mingini[1] = 2;          /* upper bound: the criterion never exceeds this */
        _mingini[2] = 1;
        _mingini[3] = x&i;
        _mingini[4] = x&i;
     end;
     else do;
        _y0w + (y^=1)*w;  _y1w + (y=1)*w;  _w + w;
     end;

     if ^eof then do;
        p0_L = _y0w/_w;  p0_R = (&_y0wsum - _y0w)/(1-_w);
        p1_L = _y1w/_w;  p1_R = (&_y1wsum - _y1w)/(1-_w);
        _ginik = p1_L*p0_L*_w + p1_R*p0_R*(1-_w);
     end;

     if _ginik<_mingini[1] then do;
        _mingini[1]=_ginik;  _mingini[2]=_n_;  _mingini[3]=x&i;
        p0_LH=p0_L;  p1_LH=p1_L;  p0_RH=p0_R;  p1_RH=p1_R;
        c_L = (p1_LH > 0.5);  c_R = (p1_RH > 0.5);
     end;
     if _n_=(_mingini[2]+1) then _mingini[4]=x&i;  /* x-value of the row after the best cut */

     if eof then do;
        cut_val=(_mingini[3]+_mingini[4])/2;       /* midpoint between the two rows */
        mingini=_mingini[1];
        varname="x&i";
        output;
     end;
run;
%mend;

%macro stump_gini(dsn, p, outdsn);
/***************************************************/
/*    dsn: Name of the input SAS data set. All     */
/*          independent variables should be named  */
/*          X1, X2, ..., Xp and be continuous      */
/*          numeric variables                      */
/*      p: Number of independent variables         */
/* outdsn: Name of the output SAS data set, used   */
/*          for subsequent scoring. Do not name    */
/*          it _giniout...                         */
/***************************************************/
%local i;

%do i=1 %to &p;
    proc sort data=&dsn.(keep=x&i  y  w)  out=sorted  sortsize=max;
         by x&i;
    run;
    data sortedv/view=sortedv;
         set sorted;
         y1=(y=1);  y0=(y^=1);
    run;
    proc means data=sortedv(keep=y0 y1 w)  noprint;
         var y0  y1;
         weight w;
         output out=_ywsum(keep=_y0wsum  _y1wsum  _FREQ_)
                sum(y0)=_y0wsum  sum(y1)=_y1wsum;
    run;
    data _null_;
         set _ywsum;
         call execute('%gini(' || compress(_y0wsum) || ','
                                || compress(_y1wsum) || ','
                                || compress(&i)      || ','
                                || compress(_FREQ_)  || ')');
    run;
%end;
data &outdsn;
     set %do i=1 %to &p;
            _giniout&i
         %end;;
run;
proc sort data=&outdsn; by mingini; run;
proc datasets library=work nolist;
     delete %do i=1 %to &p;
               _giniout&i
            %end;;
quit;
%mend;
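As noted in the header comment of %gini, the weights W must sum to 1, as they do inside boosting iterations; the constant factor 2 from the Gini definition is dropped there, since it does not change which cut is the minimizer. A usage sketch reusing my simulated data from above (normalizing the weights first):

/* Hypothetical usage sketch for %stump_gini. */
data train_n;
   set train;
   w=1/10000;        /* 10,000 observations, so weights sum to 1 */
run;

%stump_gini(train_n, 3, stumps);

proc print data=stumps(obs=3);   /* sorted by mingini: best variable first */
run;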


%macro css(_ywsum, i, nobs);
/* Regression stump: one pass computes, for every candidate cut, a weighted */
/* sum-of-squares criterion (up to a constant). Weights assumed to sum to 1.*/
data _regout&i.(keep=varname  mincss  cut_val  ypred_L  ypred_R);
     length varname $ 8;
     set sorted  end=eof;
     retain _yw  _w  0;
     retain ypred_L  ypred_R  0;
     array _mincss{4}  _temporary_;
     if _n_=1 then do;
        _yw = y*w;  _w = w;
        _mincss[1] = constant('BIG');
        _mincss[2] = 1;
        _mincss[3] = x&i;
        _mincss[4] = x&i;
        ypred_L = _yw/_w;  ypred_R = (&_ywsum-_yw)/(1-_w);
     end;
     else do;
        _yw + y*w;  _w + w;
     end;
     if ^eof then do;
        /* the leading 1 is a constant stand-in for sum(w*y*y): it shifts   */
        /* every cssk equally, so the argmin cutoff is unaffected           */
        cssk = 1 - _yw/_w*_yw - (&_ywsum-_yw)/(1-_w)*(&_ywsum-_yw);
     end;
     else do;
        cssk = 1 - _yw**2;
     end;
     if cssk<_mincss[1] then do;
        _mincss[1]=cssk;  _mincss[2]=_n_;  _mincss[3]=x&i;
        ypred_L=_yw/_w;  ypred_R=(&_ywsum-_yw)/(1-_w);
     end;
     if _n_=(_mincss[2]+1) then _mincss[4]=x&i;

     if eof then do;
        cut_val=(_mincss[3]+_mincss[4])/2;
        mincss=_mincss[1];
        varname="x&i";
        output;
     end;
run;
%mend;

%macro stump_css(dsn, p, outdsn);
/***************************************************/
/*    dsn: Name of the input SAS data set. All     */
/*          independent variables should be named  */
/*          X1, X2, ..., Xp and be continuous      */
/*          numeric variables                      */
/*      p: Number of independent variables         */
/* outdsn: Name of the output SAS data set, used   */
/*          for subsequent scoring. Do not name    */
/*          it _regout...                          */
/***************************************************/
%local i;
options nosource;
%do i=1 %to &p;
    proc sort data=&dsn.(keep=x&i  y  w)  out=sorted  sortsize=max;
         by x&i;
    run;
    proc means data=sorted(keep=y w)  noprint;
         var y;
         weight w;
         output out=_ywsum(keep=_ywsum  _FREQ_)  sum(y)=_ywsum;
    run;
    data _null_;
         set _ywsum;
         call execute('%css(' || compress(_ywsum) || ','
                               || compress(&i)     || ','
                               || compress(_FREQ_) || ')');
    run;
%end;
data &outdsn;
     set %do i=1 %to &p;
            _regout&i
         %end;;
run;
proc sort data=&outdsn; by mincss; run;
proc datasets library=work nolist;
     delete _regout:;
quit;
options source;
%mend;
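%stump_css is the regression counterpart for a continuous response; as noted in the %css comments, the constant 1 in cssk shifts every candidate cut equally, so the chosen cutoff is unaffected. A hypothetical call, again with weights summing to 1:

/* Hypothetical usage sketch for %stump_css on a continuous response. */
data train_r;
   set train_n;
   y=2*x3 + rand('normal');   /* continuous response driven by X3 */
run;

%stump_css(train_r, 3, stumps_css);

proc print data=stumps_css(obs=3);   /* sorted by mincss */
run;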


Comparing run time and results.

The example data set is the AUC Small training data from the AusDM2009 competition (200 predictor variables on 15,000 ratings), which can be found @ Here .

Using the example code from the book, it takes 1563 seconds on a regular Windows desktop (Core2Duo E6750 2.67GHz, 4GB memory, 7200 rpm HDD with 8MB cache) to process 5 continuous numeric variables, whereas the improved macro takes only 1 second on the same data. This improvement is critical because weak classifiers like this one are not used alone but as the base learner for far more time-consuming boosting algorithms. With the original macro, boosting is practically infeasible on data sets with more than a few hundred observations.

Original Macro:

Improved Macro:
Comparing the results, both the original and the improved macro select X3, with the same partition cutoff and the same Gini index.

Reference:
Dmitrienko, A., Chuang-Stein, C., and D'Agostino, R.B., Pharmaceutical Statistics Using SAS: A Practical Guide, SAS Publishing, 2007


Friday, February 05, 2010

SAS implementation of Kernel PCA

The kernel method is a very useful technique in data mining that is applicable to any algorithm relying on inner products [1]. The key is to apply an appropriate kernel function to the inner products of the original data space.
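In symbols, a kernel function satisfies $k(x_i, x_j) = \langle \phi(x_i), \phi(x_j) \rangle$ for some feature map $\phi$, so replacing every inner product $x_i'x_j$ with $k(x_i, x_j)$ runs the algorithm in the feature space without ever computing $\phi$ explicitly.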

Here I show SAS/STAT+BASE example code for a kernel PCA implementation. The example used is from Wikipedia @ here. Extension to other algorithms suitable for the kernel method, such as CCA, ICA, LDA, etc., is straightforward.

Reference:
[1]. Zhu, Mu, "Kernels and Ensembles: Perspectives on Statistical Learning", The American Statistician. May 1, 2008, 62(2): 97-109

Figure: Result



SAS demo code:


/* Demonstrate kernel PCA. Kernel method can be applied to any algorithm that 
   relies on inner product */
data original;
     do ID=-314 to 314;
        x1=sin(ID/100)*1+rannor(8976565)*0.1; 
        x2=cos(ID/100)*1+rannor(92654782)*0.1; 
        Class=1; output;
        x1=sin(ID/100)*2+rannor(8976565)*0.1;
        x2=cos(ID/100)*2+rannor(92654782)*0.1; 
        Class=2; output;
        x1=sin(ID/100)*3+rannor(8976565)*0.1; 
        x2=cos(ID/100)*3+rannor(92654782)*0.1; 
        Class=3; output;
     end;
run;

/*------------ Linear PCA -----------*/
proc princomp data=original  standard noint cov 
              outstat=lin_stat(where=(_TYPE_ = 'USCORE'))
              noprint;
        var x1 x2;
run;

data lin_stat_score;
     set lin_stat;
     _TYPE_= 'PARMS';  /* relabel the USCORE rows so PROC SCORE applies them as coefficients */
run;

proc score data=original   score=lin_stat_score  type=parms   out=lin_pca;
     var x1 x2;
run;

/*--------- Kernel PCA 1.0 ------------*/
/* Inner product based kernel PCA */
proc transpose data=original(drop=ID  Class) out=original_t; run;

proc corr data=original_t 
          outp=inner(where=(_TYPE_='SSCP' & substr(_NAME_, 1, 3)='COL') 
                     drop=intercept)  
          sscp noprint;
     var col:;
run;

/* Apply the kernel function to the inner product matrix. Here we use
   (x'y+1)**2, the polynomial kernel from the wikipedia.org example. */
data inner;
     set inner;
     array _C{*} _numeric_;
     _TYPE_='COV';
     do j=1 to dim(_C); _C[j]=(_C[j]+1)**2; end;
     drop j;
run;
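One caveat: textbook kernel PCA double-centers the kernel matrix in feature space before the eigendecomposition, a step the code above skips. Since this post restricts itself to SAS/STAT+BASE, treat the following as an optional aside, a minimal centering sketch assuming SAS/IML is licensed:

/* Hedged sketch: double-center the kernel matrix, Kc = K - JK - KJ + JKJ, */
/* where J is the n x n matrix with every entry 1/n. Requires SAS/IML.     */
proc iml;
   use inner;  read all var _num_ into K;  close inner;
   n  = nrow(K);
   J  = j(n, n, 1/n);
   Kc = K - J*K - K*J + J*K*J;
   create inner_c from Kc;  append from Kc;  close inner_c;
quit;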

proc princomp data=inner    noint  cov  standard
              outstat=k_stat(where=(_TYPE_ in ('EIGENVAL', 'USCORE')) )
              noprint;
     var col:;
run;

data S(drop=_NAME_)  score;
     set k_stat;
     if _TYPE_='EIGENVAL' then output S; else output score;
     drop _TYPE_;
run;

data _null_;
     set S;
     array _S{*} _numeric_;
     /* default to keeping all components in case no eigenvalue is near zero */
     call symput('maxsingval', dim(_S));
     do j=1 to dim(_S);
        if _S[j]<0.5*constant('MACEPS') then do;
           call symput('maxsingval', j-1);
           stop;
        end;
     end;
run;
%put &maxsingval;

data score; 
     set score; if _n_>&maxsingval then delete; retain _TYPE_ 'PARMS';  
run;

proc score data=inner   score=score  type=parms  out=k_U(keep=Prin:);
        var col:;
run;

data k_U;
     set original(keep=ID Class  x1 x2)  ;
     set k_U;
run;

/*
proc gplot data=k_U;
     plot Prin1*Prin2=Class;
run;quit;
*/

/*--------- Kernel PCA 2.0------------*/
/* Frobenius norm based kernel PCA */
proc transpose data=original(drop=ID  Class) out=original_t; run;

proc corr data=original_t  
          outp=inner2(where=(_TYPE_='SSCP' & substr(_NAME_, 1, 3)='COL')  
                      drop=intercept)  
          sscp noprint;
     var col:;
run;

proc means data=original_t noprint ;
         var col:;
         output out=_uss2(drop=_TYPE_    _FREQ_)    uss= /autoname;
run;

proc transpose data=_uss2  out=_uss2t; run;

proc sql noprint;
          select nobs into :NCOLS from sashelp.vtable
          where libname='WORK'
              and memtype='DATA'
              and memname='_USS2T'
              ;
quit;
%let ncols=%sysfunc(compress(&NCOLS));
%put &ncols;

/* Apply the kernel function to the inner product matrix. Here we use the Gaussian kernel function. */
%let sigma=1;
data inner2;
     array _X{&ncols} COL1-COL&ncols;
     array _USS0{&ncols}  _temporary_;
     if _n_=1 then do;
        do j=1 to &ncols;
           set _uss2t(keep=COL1)  point=j;
           _USS0[j]=COL1;
        end;
     end;
     set inner2;
     _TYPE_='COV';
     do j=1 to &ncols;
        /* squared Euclidean distance via ||xi||**2 + ||xj||**2 - 2*xi'xj */
        _X[j]=_USS0[_n_] + _USS0[j] - 2*_X[j];
        _X[j]=exp(-0.5*_X[j]/(&sigma**2));
     end;
     keep _TYPE_  _NAME_ COL1-COL&ncols;
run;
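The data step above relies on the identity $\|x_i - x_j\|^2 = \|x_i\|^2 + \|x_j\|^2 - 2\,x_i'x_j$: the uncorrected sums of squares from PROC MEANS supply the $\|x_i\|^2$ terms, and the SSCP matrix supplies the inner products, so the Gaussian kernel $K_{ij} = \exp\!\big(-\|x_i - x_j\|^2/(2\sigma^2)\big)$ is obtained without another pass over the raw data.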


proc princomp data=inner2    noint  cov  standard
              outstat=k_stat2(where=(_TYPE_ in ('EIGENVAL', 'USCORE')) )
              noprint;
     var col:;
run;

data S2(drop=_NAME_)  score2;
     set k_stat2;
     if _TYPE_='EIGENVAL' then output S2; else output score2;
     drop _TYPE_;
run;

data _null_;
     set S2;
     array _S{*} _numeric_;
     /* default to keeping all components in case no eigenvalue is near zero */
     call symput('maxsingval', dim(_S));
     do j=1 to dim(_S);
        if _S[j]<0.5*constant('MACEPS') then do;
           call symput('maxsingval', j-1);
           stop;
        end;
     end;
run;
%put &maxsingval;

data score2; 
     set score2; if _n_>&maxsingval then delete; retain _TYPE_ 'PARMS';  
run;

proc score data=inner2   score=score2  type=parms  out=k_U2(keep=Prin:);
        var col:;
run;

data k_U2;
     set original(keep=ID Class  x1 x2)  ;
     set k_U2;
run;

/*@ Draw figure using R @*/

%macro RScript(Rscript);
data _null_;
     file "&Rscript";
     infile cards ;
     input;
     put _infile_;
%mend;

 
%macro CallR(Rscript, Rlog);
systask command "D:\Progra~1\Analyt~1\R\bin\R.exe CMD BATCH --vanilla --quiet 
                 &Rscript  &Rlog"   taskname=rjob1  wait  status=rjobstatus1;
%mend;

options nosource;
proc export data=original   outfile="c:\o.csv"  dbms=csv; run;
proc export data=lin_PCA    outfile="c:\a.csv"  dbms=csv; run;
proc export data=k_U       outfile="c:\b.csv"  dbms=csv; run;
proc export data=k_U2     outfile="c:\c.csv"   dbms=csv; run;
options source;

%RScript(c:\rscript.r)
cards;
odsn <- read.csv('c:/o.csv', header=T)
adsn <- read.csv('c:/a.csv', header=T)
bdsn <- read.csv('c:/b.csv', header=T)
cdsn <- read.csv('c:/c.csv', header=T)
png(file='c:/figure.png')
par(mfcol=c(2,2), mar=c(4,4,3,2))
plot(x1~x2, data=odsn, col=Class, main='Original')
plot(Prin1~Prin2, data=adsn, col=Class, main='Linear PCA')
plot(Prin1~Prin2, data=bdsn, col=Class, main='Polynomial Kernel')
plot(Prin1~Prin2, data=cdsn, col=Class, main='Gaussian Kernel')
dev.off()
;
run;

%CallR(c:\rscript.r, c:\rlog1.txt);

Further Reading:
Chapter 1 & 2 of
Principal Manifolds for Data Visualization and Dimension Reduction, A. Gorban, B. Kegl, D. Wunsch, and A. Zinovyev (eds.), Lecture Notes in Computational Science and Engineering, Springer, 2007


Tuesday, February 02, 2010

Partial Least Squares

In some predictive modeling projects, we may have variables for which most observations share the same value while only a small percentage carry meaningful values. For example, for a variable such as WebHits, 90% of observations may have the value 0 while the remaining 10% have values 1, 2, 3, and so on. We may have a large number of such variables, say web hits on different pages, and because so few observations carry distinct values, each variable on its own shows minimal predictive power.

When we have a large number of such variables and want a quick way to determine whether they collectively carry predictive power, the Partial Least Squares (PLS) method is useful.

In SAS, we can get the PLS scores and score new data in this way:




%macro PLSSCORE(dsn, XCenScale, XWeights, outdsn, prefix=_xscr_);
/*       dsn: data set to score                                    */
/* XCenScale: data set with one row per predictor holding the      */
/*            variable name, mean, and std used in the PLS fit     */
/*  XWeights: data set of PLS X-weights, one row per factor        */
/*    outdsn: output data set with scores &prefix.1, &prefix.2 ... */
%if %length(&prefix)=0 %then %let prefix=_xscr_;
proc sql noprint;
     select variable into :covars separated by ' '
     from &XCenScale;
quit;
proc transpose data=&XCenScale out=&XCenScale._t;
     id variable;
run;
data &XCenScale._t;
     if _n_=1 then _TYPE_='MEAN';
     else _TYPE_='STD';
     set &XCenScale._t;
run;
data &XWeights;
     length _TYPE_ $ 5;
     _TYPE_='PARMS';
     set &XWeights;
     _NAME_=compress("&prefix"||_n_);
run;
data Xscore/view=Xscore;
     length _TYPE_ $ 5;
     set &XCenScale._t  &XWeights;
run;
/* PROC SCORE standardizes with the MEAN/STD rows, then applies the weights */
proc score data=&dsn  score=Xscore  type=parms  out=&outdsn;
     var &covars;
run;
%mend;
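The macro's two input data sets are not produced above. A hedged sketch of one way to obtain them with PROC PLS: the CENSCALE and DETAILS options and the ODS table names CenScaleParms and XWeights are taken from the SAS/STAT documentation, but verify the table layout on your release, and note that TRAIN, NEWDATA, Y, and X1-X10 below are hypothetical names.

/* Hedged sketch: capture centering/scaling parameters and X-weights from  */
/* PROC PLS. %PLSSCORE expects &XCenScale to have a Variable column plus   */
/* the mean and std per predictor, so a light reshape may be needed.       */
ods output CenScaleParms=xcenscale  XWeights=xweights;
proc pls data=train  nfac=3  censcale  details;
   model y = x1-x10;
run;

%PLSSCORE(newdata, xcenscale, xweights, pls_scored, prefix=_xscr_);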

A lightweight yet detailed explanation of PLS and its application to data mining projects can be found in:
Dmitrienko, A., Chuang-Stein, C., and D'Agostino, R.B., Pharmaceutical Statistics Using SAS: A Practical Guide, SAS Publishing, 2007
