Monday, August 09, 2010

VARIMAX rotation of PLS loadings

Partial Least Square is one of several supervised dimension reduction techniques and attracts attention in recent years. In the one hand, PLS is able to generate a series of scores that maximize linear correlation between dependent variables and independent variables, on the other hand, the loading of PLS can be regarded as similar counterpart from factor analysis, hence we can rotate the loadings from PLS therefore eliminate some of the non-significant variable in terms of prediction.


%macro PLSRotate(Loading, TransMat, PatternOut, PatternShort, 
                 method=VARIMAX, threshold=0.25);
/* VARIMAX rotation of PLS loadings. Only variables having 
   large loadings after rotation will enter the final model. 

   Loading dataset contains XLoadings output from PROC PLS 
   and should have variable called NumberOfFactors
   TransMat is the generated Transformation matrix;
   PatternOut is the output Pattern after rotation;
   PatternShort is the output Pattern with selected variables
*/

%local covars;
proc sql noprint;
     select name into :covars separated by ' '
  from   sashelp.vcolumn
  where  libname="WORK" & memname=upcase("&Loading") 
        &   upcase(name) NE "NUMBEROFFACTORS"
  &   type="num"
  ;
quit;
%put &covars;

data &Loading.(type=factor);
         set &Loading;
         _TYPE_='PATTERN';
         _NAME_=compress('factor'||_n_);
run;
ods select none;
ods output OrthRotFactPat=&PatternOut;
ods output OrthTrans=&TransMat; 
proc factor  data=&Loading   method=pattern  rotate=&method  simple; 
         var &covars;
run;
ods select all;

data &PatternShort;
     set &PatternOut;
  array _f{*} factor:;
  _cntfac=0;
  do _j=1 to dim(_f);  
        _f[_j]=_f[_j]*(abs(_f[_j])>&threshold); _cntfac+(_f[_j]>0); 
     end;
  if _cntfac>0 then output;
  drop _cntfac _j;
run;
%mend;

Here I try to replicate the case study in [1] which elaborated how to do and properties of VARIMAX rotation to PLS loadings. The PROC PLS output, after various tweaks on convergence criteria and singularity conditions, is still a little different from the result reported in [1] for factors other than the leading one, therefore, I will directly use the U=PS matrix in pp.215.



data loading;
input factor1-factor3;
cards;
-0.9280  -0.0481  0.2750
0.0563  -0.8833  0.5306
-0.9296  -0.0450  0.2720
-0.7534  0.1705  -0.5945
0.5917  -0.0251  -0.6450
0.9082  0.3345    0.1118
-0.8086  0.4551  -0.3800
;
run;


proc transpose data=loading  out=loading2;
run;

data loading2(type=factor);
     retain _TYPE_ "PATTERN";
  set loading2;
run;


ods select none;
ods output OrthRotFactPat=OrthRotationOut;
ods output OrthTrans=OrthTrans; 
proc factor  data=Loading2   method=pattern  rotate=varimax  simple; 
         var col1-col7;
run;
ods select all;


Reference:
[1] Huiwen Wang; Qiang Liu , Yongping Tu, "Interpretation of PLS Regression Models with VARIMAX Rotation", Computational Statistics and Data Analysis, Vol.48 (2005) pp207 – 219

Wednesday, August 04, 2010

Table Look Up in SAS, practical problems

One guy asked in a SAS forum about a typical table look up problem:
He has a data with two IDs:
id1 id2
a b
a e
b c
b e
c e
d e

and he wants to generate a new data set with the following structure according to above information :
id a b c d e
a 0 1 0 0 1
b 1 0 1 0 1
c 0 1 0 0 1
d 0 0 0 0 1
e 1 1 1 1 0

The real data is potentially big.
***************************;
At first look, this is a typical table look up problem SAS programmers facing almost everyday, that is duplicate keyed lookup table. It is a simple one because there is no inherent relationship among records.


data original;
   input id1 $ id2 $;
datalines;
a b
a e
b c
b e
c e
d e
;
run;

proc datasets library=work nolist;
     modify original;
  index create id1 id2;
quit;

proc sql;
     create table all_cases as
  select a.*, monotonic() as seq
  from (
  select distinct id1 as id
  from original
  union
  select distinct id2 as id
  from original
  ) as a
  order by a.id
  ;
quit;

proc sql noprint;
     select id into :idnames separated by ' '
  from   all_cases
  ;
quit;

data new;
  if _n_=1 then do;
     declare hash _h(dataset:'all_cases');
     _h.defineKey('id');
     _h.defineData('seq');
     _h.defineDone();
     end; 
     set all_cases;

  array _a{*} &idnames; 

  id1=id;  
  set original key=id1;      
  _mx_=%sysrc(_sok);
  
  do while (_iorc_=%sysrc(_sok));   
     rc=_h.find(key:id2); if rc=0 then _a[seq]=1;
  id1=id;
     set original key=id1;  
  
  end;
  _ERROR_=0;
  
  id2=id;  
  set original key=id2;      
  do while (_iorc_=%sysrc(_sok)); 
     rc=_h.find(key:id1); if rc=0 then _a[seq]=1;
  id2=id;
     set original key=id2;  
  end;
  _ERROR_=0;
  do j=1 to dim(_a); _a[j]=max(0, _a[j]); end;
  keep id &idnames;
run;

On the other hand, this problem can be solved in a more SASsy way like this:


data original;
   input id1 $ id2 $;
datalines;
a b
a e
b c
b e
c e
d e
;
run;

proc sql;
     create table newx as
     select a.id1, a.id2, (sum(a.id1=c.id1 & a.id2=c.id2)>0) as count
     from   
       (select a.id as id1, b.id as id2 
        from all_cases as a, all_cases as b) as a
left join   original as c
       on   a.id1=c.id1 or a.id2=c.id1
    group by a.id1, a.id2
    ;
quit;

proc transpose data=newx  out=_freq_t name=id2;
     by id1;
     var count;
     id id2;
run;

data _freq_t;
     set _freq_t;
     array _n{*} _numeric_;
     do i=1 to dim(_n);
        _n[i]=(_n[i]>0);
     end;
     drop i;
run;

proc transpose data=_freq_t(drop=id2) out=_freq_t2  name=id1;
     id id1;
run;

proc sql noprint;
     select id1, count(distinct id1) into :covars separated by ' ', :count
     from   _freq_t;  
quit;

data new2;
     set _freq_t;
     array _x{*} &covars;
     array _x2{&count} _temporary_;

     do j=1 to &count; _x2[j]=_x[j]; end;
     set _freq_t2;
     do j=1 to &count; _x[j]=(_x[j]+_x2[j]>0); end;
     drop j id2;
run;