Monday, December 15, 2014

Experient downdating algorithm for Leave-One-Out CV in RDA

In this post, I want to demonstrate a piece of experiment code for downdating algorithm for Leave-One-Out (LOO) Cross Validation in Regularized Discriminant Analysis [1].

In LOO CV, the program needs to calculate the inverse of \(\hat{\Sigma}_{k\v}(\lambda, \gamma)\) for each observation v to obtain its new scoring function at a given pair of regularizing pair parameter \( (\lambda, \gamma) \), where \(k\v\) indicates class \(k\) excluding observation \(v\).

As mentioned in [1], to obtain the inverse of this variance covariance matrix for class k, it is not sufficient to just remove observation \(v\) and recalculate it, but instead, we try to calculate $$W_{k\v}(\lambda, \gamma)\hat{\Sigma}^{-1}_{k\\v}(\lambda, \gamma)$$, which is equivalent to compute the inverse of :

\[
  W_k(\lambda)\hat{\Sigma}^{-1}_k (\lambda) - \gamma \|Z_v\|^2  \cdot  I - (1-\gamma)Z_v Z_v^{t} \]

In this formula, the first element \(W_k(\lambda)\hat{\Sigma}^{-1}_k (\lambda) \) does not involve quantities varying along with observation \(v\) and can be pre-computed. The latter two can be easily updated on the fly as we scan through each individual observations.

With this in mind, we use the iris data for demonstration. Note that the SAS code here is for illustrating the concept of downdating algorithm above for a given pair of \( (\lambda, \gamma) \). The computing is divided into several steps. (1) We first use PROC DISCRIM to calculate CSSCP matrix, total weight and variable mean by class. These quantities correspond to \( \Sigma_k \), \( \bar{X_k}\) and \( W_k\) in the original paper. (2) We then use the %SVD Macro @here to obtain key elements for later updating.

Note that in order to obtain CSSCP, MEAN and total weights, in addition to PROC DISCRIM, we can also use PROC CORR. The advantage is that we can keep all computing within the framework of SAS/Base so to benefit budget-sensitive business and maximize compatibility. The down side is that the data set need to sorted or indexed to obtain CSSCP by class and the data has to be passed twice, one for pooled CSSCP and the other for class specific CSSCP.

(to be completed...)
 

/* study downdating algorithm */
%let lambda = 0.3;
%let gamma = 0.6;
%let target = Species;
%let covars = SepalLength SepalWidth  PetalLength  PetalWidth;

/* step 0. Obtain key statistics from DISCRIM */
proc discrim data=iris  outstat=stat noprint  noclassify;
      class ⌖
   var   &covars;
run;

/* step 1. Obtain \Sigma_k(\lambda) & \Sigma_k(\lambda, \gamma)
           \S_k = (_TYPE_="CSSCP' & &target = "something")
           \S  = (_TYPE_="CSSCP" & &target = " ")
           \W_k = (_TYPE_="N" & &target = "something") -1
           \W  = (_TYPE_="N" & &target = " ") - &nClass + 1
           \Sigma_k(\lambda) = \S_k(\lambda) / \W_k(\lambda)
              where:
                    \S_k(\lambda) = (1-\lambda)*S_k + \lambda*S
                    \W_k(\lambda) = (1-\lambda)*W_k + \lambda*W

          \Sigma_k(\lambda, \gamma) = (1-\lambda)*\Sigma_k(\lambda) + \gamma* trace[\Sigma_k(\lambda)]/p*I         
*/ 
proc sql noprint;
     select distinct &target into :targetvalues separated by " "
  from   stat
  where _TYPE_="CSSCP"
  ;
  select distinct _NAME_ into :covars separated by " "
  from   stat
  where _TYPE_="CSSCP"
  ;
quit;
%put &targetvalues;
%put &covars;

%let nClass=%sysfunc(countw("&targetvalues"));
%put &nClass;
%let nVars=%sysfunc(countw("&covars"));
%put &nVars;

data weights;
     set stat;
  where _TYPE_="N";
  array _nv{*} _numeric_;
  if &target="" then _total=_nv[1]-&nClass+1;
  retain lambda λ
  retain _total;
  weight=(1-&lambda) * (_nv[1]-1) + &lambda*_total;  
  keep &target   weight  lambda;
  if &target ^= "";
run;

proc sort data=stat  out=sigma;
      by _TYPE_  ⌖
   WHERE _TYPE_ = "CSSCP" & &target ^= " ";
run;
proc sort data=weights;
     by  ⌖
run;

/*-  step 1.1 Update \Sigma_k(\lambda) = ((1-\lambda)*S_k + \lambda*S) / ( (1-\lambda)*W_k + \lambda*W )  -*/
                                                                                          
data sigma;
     array _sigma{&nVars, &nVars} _temporary_;
  array _x{&nVars} &covars;
  array _y{&nClass} _temporary_;  
  if _n_=1 then do;
     _iter=1;
     do until (eof1);
        set stat(where=(_TYPE_="CSSCP" & &target = " "))  end=eof1;
     do _j=1 to &nVars;
        _sigma[_iter, _j]=_x[_j];
     end;
     _iter+1;
     end; 
        
     _iter=1; 
     do until (eof2);
     set weights end=eof2;
        _y[_iter]=weight;
     _iter+1;
  end;
  put _y[3]=;
  
  _target=⌖
  _iter=0;
  end;
  modify sigma ;  
  retain  weight;
  retain  _target;
  _TYPE_="COV";
 
  if &target^=_target then do;
        _iter+1;
        _i=0; 
  weight=_y[_iter];
  _target=⌖
  end;
  _i+1;
  put "&covars";
  put &covars;
  do _j=1 to &nVars;
     _x[_j]= ((1-&lambda)*_x[_j] + &lambda*_sigma[_i, _j])  ;
  end;
  put &covars;
  put "-----------------------------------------------------------------";
  replace;  
run;

/*- trace is sum of every elemnt of a matrix -*/
data _trace;
     set sigma;
  by ⌖
  array _x{*} &covars;
  retain trace _iter;
  if first.&target then do;
     _iter = 1;
  trace=0;
  end;
  trace+_x[_iter];
  _iter+1;
  if last.&target then output;
  keep &target  trace;
run;


/*- step 1.2 Update \Sigma_k (\lambda, \gamma) = (1-\gamma)*\Sigma_k(\lambda) + \gamma *c*I -*/
/*- where  c = trace(\Sigma_k(\lambda)) / p, where p=&nVars                             -*/
data sigma;
      if _n_=1 then do;
      declare hash h(dataset:'_trace');
   h.defineKey("&target");
   h.defineData("trace");
   h.defineDone(); 
   call missing(trace);
   end;
   set sigma; by ⌖
   array _x{*} &covars;
   retain _adj  trace _iter;
   if first.&target then do;
      _iter=1;
   end;
   if _iter=1 then do;
         _rc=h.find();
         _adj= &gamma/&nVars * trace;
   end;
   put _iter=  &target=;
   do _j=1 to &nVars;
      if _j=_iter then do;
      _x[_iter] = (1-&gamma)*_x[_iter] + _adj;
   end;
   else do;
            _x[_iter] = (1-&gamma)*_x[_iter];
   end;
   end;
   _iter=_iter+1;
   drop _type_   _adj _iter _j  _rc  trace;
run;



/*-step 1.3 Obtain the inverse of W_k \Sigma_k(\lambda, \gamma) - c*I  using SVD                                     -*/  
/*-This inverse is the same as first inverse \Sigma_k(\lambda, \gamma), then with S matrix deflated by W_k(\lambda)  -*/
/*-and differenced by c, where c=\gamma* |Z_v|^2/p, and Z_v = sqrt(bk(v))*(X_v- mean(X_k\v),                         -*/
/*-bk(v)=sk(v)*Wc(v) wv/(Wc(v)-wv)                                                                                   -*/
/*-v is the current observation                                                                                      -*/

/*-       1. apply SVD to obtain eigenvalues   
%SVD(sigma, out_u, out_S, out_V)                                                                                     -*/


/*-       2. The whole CV will be done in one Data Step
             Because all related quantitaties of the rest, such as such as |Z_v|, sk(v), Wc(v) 
             will require at least one pass through of the original data, therefore it is
             best to incorporate all computing in this step.
-*/
data _iris;
     array _m{*} &covars;
     array _mt{&nVars} _temporary_;
     if _n_=1 then do;
     _class=1;
        do until eof;
     set stat(where=(_TYPE_='MEAN'& &target^=""))  end=eof;
     do _j=1 to dim(_m);
        _mt[_class, _j]=_m[_j];
     end;
     _class+1;
  end;
  end;
  array _rank1{&nVars, &nVars} _temporary_;
  set iris point=1;  
  do _j=1 to dim(_m);
     _m[_j]=_m[_j]-_mt[1, _j];
  end;
  do _j=1 to &nVars;
     do _k=1 to &nVars;
           _rank[_j, _k]=_m[_j] * _[_k];

           /*  MORE CODE TO COME*/

     end;
  end;
run;

[1] J. H. Friedman. Regularized discriminant analysis. Journal of the American Statistical Association, 84(405):165–175, 1989.

No comments: