Monday, July 12, 2010

Implement Randomized SVD in SAS

In the 2010 SASware Ballot®, a dedicated PROC for Randomized SVD was among the options. While an official SAS PROC will not be available in the immediate future as well as in older SAS releases, it is fairly simple to implement this algorithm using existing SAS/STAT procedures.

Randomized SVD will be useful for large scale, high dimension data mining problems, for instance Text Mining. In SAS/Base and SAS/STAT, lack of sparse matrix operation capability puts any serious Text Mining task at the edge of infeasibility, such as using LSI or NMF algorithms. Randomized SVD provides an economic alternate solution by sacrificing a little accuracy which is bounded under the three sampling schema proposed by the authors [1], while the code below demos sampling schema 1.



/* Randomized SVD with sampling schema 1. */
%let dim=2048;
%let nobs=1e4;
%let s=256;
data matrix;
     array _x{*} x1-x&dim;
     do id=1 to &nobs;
     do _j=1 to dim(_x); _x[_j]=sin(mod(id, _j))+rannor(id); end;
  output;
  drop _j;
  end;
run;

%let datetime_start = %sysfunc(TIME()) ;
%let time=%sysfunc(datetime(), datetime.); %put &time;
data seed;
     array _x{*} x1-x&dim;
     do _j=1 to dim(_x); _x[_j]=0; end;
  output;
  stop;
run;

proc fastclus data=matrix  seed=seed      out=norm(keep=ID DISTANCE)
              maxiter=0    maxclusters=1  noprint  replace=none;
     var x1-x&dim;
run;
data normv/ view=normv;
     set norm(keep=DISTANCE);
     DISTANCE2=DISTANCE**2;
     drop DISTANCE;
run;
proc means data=normv noprint;
     var DISTANCE2;
     output  out=matrixnorm  sum(DISTANCE2)=Frobenius_sqr;
run;
data prob;
     set matrixnorm ;
  retain Frobenius_sqr;
  do until (eof);
     set norm  end=eof;
  _rate_=DISTANCE**2/Frobenius_sqr;
  keep ID _rate_;
  output;
  end;
run;

data matrixv/view=matrixv;
     merge matrix  prob(keep=_rate_);
run;

ods select none;
proc surveyselect data=matrixv  out=matrixsamp(drop=SamplingWeight  ExpectedHits  NumberHits _rate_)  
                   sampsize=&s  method=pps_wr   outhits  ;
     size _rate_;
run;
ods select all;

proc transpose data=matrixsamp  out=matrixsamp;
     var x1-x&dim;
run;

proc princomp data=matrixsamp  outstat=testv(where=(_type_ in ("USCORE")))  
              noint  cov  noprint;
  var col1-col&s;
run;
data testV_t/view=testV_t;
     retain _TYPE_ 'PARMS';
  set testv(drop=_TYPE_);
run;     

proc score data=matrixsamp   score=testV_t  type=parms  
           out=SW(keep=ID Prin:);
    var   col1-col&s;
run;

data seed;
     array _s{*}  prin1-prin&s;
  do _j=1 to dim(_s); _s[_j]=0; end;
  drop _j; output; stop;
run;

proc fastclus data=SW   seed=seed  maxiter=0  maxc=1  replace=NONE   out=SW2(drop=CLUSTER)  noprint;
     var prin1-prin&s;
run;

data HHT;
     set SW2;
  array _x{*}  prin1-prin&s;
  do _j=1 to dim(_x); _x[_j]=(_x[_j]/distance)**2; end;
  drop _j  distance;
run;

proc transpose data=HHT  out=HHT2(drop=_LABEL_);      
run;
data HHT2; 
     _TYPE_='PARMS';
     set HHT2; 
  rename COL1-COL2048=x1-X2048;
run;

proc score data=matrix   score=HHT2  type=parms  out=P(drop=x:);
     var x:;
run;     

%let time=%sysfunc(datetime(), datetime.); %put &time;
%put PROCESSING TIME:  %sysfunc(putn(%sysevalf(%sysfunc(TIME())-&datetime_start.),mmss.)) (mm:ss) ;
options notes source;


Reference:
[1], P. Drineas and M. W. Mahoney, "Randomized Algorithms for Matrices and Massive Data Sets", Proc. of the 32nd Annual Conference on Very Large Data Bases (VLDB), p. 1269, 2006.

0 comments: