Sunday, March 28, 2010

Implement Boost Algorithm in SAS

Boost algorithms are proven to be very effective data mining tools, either used stand alone, or as a building block to handle nonlinearity, etc. Implementation of Boost algorithm in SAS is not easy to find although it is not difficult to write one yourselve. I completely rewrite the %Boost macro in the book "Pharmaceutical Statistics Using SAS: A Practical Guide" [1], which covers AdaBoost, RealBoost, GentlBoost and LogitBoost algorithms. Related %Predict macro is very straight forward to rewrite, too. Note that the author uses Gini index as impurity measurement, while in most standard implementations, a weak classifier directly minimizes weighted error rate [2].

The original macro (Found @ Here) uses SAS/IML and the way it handles computation makes it impossible to work on data set with even moderate size. For example, working on a table with 4000+ observations, the original macro will utilize >850M memory and may cause the program collapse. In many rare event study, a larger sample is necessary. For example, a fraud detection program with 0.01% fraud case may well require >100K total records for a reliable analysis. A much improved SAS/IML version can be found @ here.

New macro utilizing DATA STEP will be more tolerable to very large data set, and is much faster in this case (when your data consists less than 3000 observations, the original macro works much faster). Basically computing time and space requirement of this macro is linear in the number of observations while quadratic for original macro. The new macro will involve computing modules %gini, %stump_gini, %csss, %stump_css in this post (@ Here).

Note that if duplicate values exist, these two macros will not produce exactly the same result if different response categories appear among the duplicate valued records, because of the way these responses arranged. But both programs works in the sense that they will reduce error rate as iteration goes on.

Future development includes weak classifier directly min error rate, and using a more effecient way to handle small data set.

Reference:
[1]. Alex Dmitrienko, Christy Chuang-Stein, Ralph B. D'Agostino, "Pharmaceutical Statistics Using SAS: A Practical Guide", SAS Institute 2007;
[2]. Yoav Freund & Robert E. Schapire, "A Short Introduction to Boosting", Journal of Japanese Society for Artificial Intelligence, 14(5):771-780, September, 1999;

Friday, March 26, 2010

A Macro for SVD

SVD is at the heart of many modern machine learning algorithms. As a computing vehicle for PCA, SVD can be obtained using PROC PRINCOMP on the covariance matrix of a given matrix withou correction for intercept. With SVD, we are ready to carry out many tasks that are very useful but not readily available in SAS/STAT, such as TextMining using LSI [default algorithm used in SAS TextMiner [1]], multivariate Time Series Analysis using MSSA, Logistic-PLS, etc.

I also highly recommend the book "Principal Component Analysis 2nd Edition" by I. T. Jolliffe. Prof. Jollliffe smoothly gave a thorough review of PCA and its applications in various fields, and provided a road map for further research and reading.


%macro SVD(
           input_dsn,
           output_V,
           output_S,
           output_U,     
           input_vars,
           ID_var,
     nfac=0
           );

%local blank   para  EV  USCORE  n  pos  dsid nobs nstmt
       shownote  showsource  ;

%let shownote=%sysfunc(getoption(NOTES));
%let showsource=%sysfunc(getoption(SOURCE));
options nonotes  nosource;

%let blank=%str( );
%let EV=EIGENVAL;
%let USCORE=USCORE;

%let n=%sysfunc(countW(&input_vars));

%let dsid=%sysfunc(open(&input_dsn));
%let nobs=%sysfunc(attrn(&dsid, NOBS));
%let dsid=%sysfunc(close(&dsid));
%if  &nfac eq 0 %then %do;
     %let nstmt=␣ %let nfac=&n;
%end;     
%else %do;
     %let x=%sysfunc(notdigit(&nfac, 1)); 
  %if  &x eq 0 %then %do;
          %let nfac=%sysfunc(min(&nfac, &n));
          %let nstmt=%str(n=&nfac);
  %end;
  %else %do;
          %put ERROR: Only accept non-negative integer.;
          %goto exit;
  %end;
%end;

/* calculate U=XV/S */
%if &output_U ne %str() %then %do;
    %let outstmt=  out=&output_U.(keep=&ID_var  Prin:);
%end;
%else %do;
    %let outstmt=␣
%end;

%let options=noint cov noprint  &nstmt;

proc princomp data=&input_dsn  
             /* out=&input_dsn._score */
              &outstmt
              outstat=&input_dsn._stat(where=(_type_ in ("&USCORE", "&EV")))  &options;
     var &input_vars;
run;
data &output_S;
     set &input_dsn._stat;
     format Number 7.0;
     format EigenValue Proportion Cumulative 7.4;
     keep Number EigenValue  Proportion Cumulative;
     where _type_="&EV";
     array _X{&n} &input_vars;
     Total=sum(of &input_vars);
     Cumulative=0;
     do Number=1 to dim(_X);
     EigenValue=_X[number];
     Proportion=_X[Number]/Total;
     Cumulative=Cumulative+Proportion;  
     output;
  end;
run;

%if &output_V ne %str() %then %do;
proc transpose data=&input_dsn._stat(where=(_TYPE_="&USCORE")) 
               out=&output_V.(rename=(_NAME_=variable))
               name=_NAME_;
     var &input_vars;
     id _NAME_;
  format &input_vars 8.6;
run;
%end;

/* recompute Proportion */
%if &output_S ne %str() %then %do;
data &output_S;
     set &input_dsn._stat ;
  where _TYPE_="EIGENVAL";
  array _s{*} &input_vars;
  array _x{&nfac, 3} _temporary_; 
  Total=sum(of &input_vars, 0);
  _t=0;
  do _i=1 to &nfac;
     _x[_i, 1]=_s[_i]; _x[_i, 2]=_s[_i]/Total; 
  if _i=1 then _x[_i, 3]=_x[_i, 2]; 
  else _x[_i, 3]=_x[_i-1, 3]+_x[_i, 2];
  _t+sqrt(_x[_i, 2]);
  end;
  do _i=1 to &nfac;
     Number=_i;  
  EigenValue=_x[_i, 1]; Proportion=_x[_i, 2]; Cumulative=_x[_i, 3];
     S=sqrt(_x[_i, 2])/_t;  SinguVal=sqrt(_x[_i, 1] * &nobs);
  keep Number EigenValue  Proportion Cumulative  S SinguVal;
     output;
  end;
run;
%end;

%if &output_U ne %str() %then %do; 
data &output_U;
     array _S{&nfac}  _temporary_;  
     if _n_=1 then do;
        do j=1 to &nfac;
           set  &output_S(keep=SinguVal)  point=j;
           _S[j]=SinguVal; 
           if abs(_S[j]) < CONSTANT('MACEPS') then _S[j]=CONSTANT('BIG');
        end;
    end;
    set &output_U;
    array _A{*}  Prin1-Prin&nfac;
    do _j=1 to dim(_A);
        _A[_j]=_A[_j]/_S[_j];
    end;
    keep &ID_var Prin1-Prin&nfac ;
run;
%end;

%exit: 
options &shownote  &showsource;
%mend;

Try the following sample code to examine the results:


data td;
    input x1 x2;
cards;
2 0
0 -3
;
run;

%let input_dsn=td;
%let id_var= ;
%SVD(&input_dsn,
      output_V,
      output_S,
      output_U,     
      x1  x2,
      &id_var,
      nfac=0
      );



Reference:
[1] Albright, Russ, "Taming Text with the SVD", SAS Institute Inc., Cary, NC, available at :
http://ftp.sas.com/techsup/download/EMiner/TamingTextwiththeSVD.pdf

[2] Jolliffe, I. T. , "Principal Component Analysis", 2nd Ed., Springer Series in Statistics, 2002
Principal Component Analysis