SAS Programming for Data Mining: December 2009

Saturday, December 12, 2009

Clustering Handwirtten Digits (digitalized optical images)

In this example, we show the a clustering exercise on Optical Recognition of Handwritten Digits Data Set available @ UCI data set repository (Link).

This exercise is a standard application of HOSVD by stacking 8X8 matrix of digitalized bitmap of each letter where 1-mode is digits for each column and 2-mode corresponds to rows while 3-mode is each letter object, a standard Tensor representation. U_(3) from HOSVD of this Tensor gives indication information for clustering. In this exercise, we apply k-means algorithm on the correlation matrix of each row of U_(3), which corresponds to each subject, with each column of right eigenvector matrix to eliminate outlier effect and stablize the matrix for final clustering algorithm.

The clustering result, together with original unclustered correlation data and correlation data sorted by true letter ID, is shown below. In order to facilitate display, we show the confusion matrix of the correlation matrix so that the clustering result is more obvious to identify.

Cross Tabulation of True Class (0-9) and Clustering Result (1-10):

Further thoughts. The confusion matrix display shows that while the letter clusters have shown up but the pattern is not as prominent as we expected. However, it is possible to obtain better result by applying Nonnegative Matrix Factorization (NMF) to the square of correlation confusion matrix. The NMF algorithm will be implemented in SAS soon.

Note: All numerical analyses are done using the prototype code demonstrated at this Blog Entry (Link). This figure is drawn in R.

【 Clustering Result shown in Confusion Matrix style 】

Reference:
Matrix Methods in Data Mining and Pattern Recognition by Lars Eldén

Matrix Methods in Data Mining and Pattern Recognition (Fundamentals of Algorithms)

Wednesday, December 09, 2009

Tensor in SAS

Tensor, a math concept for high order array, is a very useful tool in modern data mining applications. HOSVD, the counter part of SVD in higher order array, is at the heart of modern applications, such as face recognition and clustering, segmentation of multi-dimensional transaction data, etc.

【"Matrix Method in Data Mining and Pattern Recognition" by Lars Eldén】

But implementation of math operations designed for Tensor is not easy, not to mention doing that in SAS. Due to the design nature of SAS data set, everything has to be done in a 1-D or 2-D matrix fashion. Luckily, Tensor operations have their low dimension matrix counterpart, only that matrix subscriptions need to be calculated carefully.

I spent recent couple of days to implement some featured Tensor operations, notably Tensor unfolding (Tensor to Matrix) and HOSVD for my research and work. A prototype code for 3-mode Tensor unfolding and HOSVD SAS code is shown below. Data is from Lathauwer et al., "A MULTILINEAR SINGULAR VALUE DECOMPOSITION", SIAM J. MATRIX ANAL. APPL. vol.21, No.4



data A1;
     input X1-X3;
datalines;
 0.9073  0.8924  2.1488
 0.7158 -0.4898  0.3054
-0.3698  2.4288  2.3753
 1.7842  1.7753  4.2495
 1.6970 -1.5077  0.3207
 0.0151  4.0337  4.7146
 2.1236 -0.6631  1.8260
-0.0740  1.9103  2.1335
 1.4420 -1.7495 -0.2716
;
run;

filename Acat  catalog 'work.tensor.A2.CATAMS';

data _null_;
     file Acat;
     mode='1;';
     dimension='3,3,3;';
     put mode dimension;
run;

%let _dims=2 3 4;
%let _totalmode=3;
%let _mode0=1;
%let _modex=2;
%let _dimension2=6;
%let _dimension1=4;

data _null_;
      infile A2Cat;
      input;  
      _index=trim(scan(_infile_, 1, ';')); put _index=;
      _dimension=trim(scan(_infile_, 2, ';')); put _dimension=;       
      d2=count((_dimension), ',');  put d2=;
      call symput('_totalmode', d2+1);   
      call symput('_dims', translate(_dimension, ' ', ','));
      _dim0=scan(_dimension, &_mode0, ',');
      _dimx=scan(_dimension, &_modex, ',');
      call symput('_dim0', _dim0);
      call symput('_dimx', _dimx);
      call symput('_dim0x', compress(max(_dim0, _dimx)));
      dim2=1; dim1=1;
      do j=1 to d2+1;      
         if j=&_modex then dim1=dim1*scan(_dimension, j, ',');
         else dim2=dim2*scan(_dimension, j, ',');         
      end;
      put dim1= dim2= dim20=;
      call symput('_dimension1', compress(dim1));
      call symput('_dimension2', compress(dim2));   
run;
%put &_totalmode  &_dim0x;
%put &_dimension1  &_dimension2;
%put &_dims;

data  A&_modex;
      array _dim{&_totalmode} _temporary_ (&_dims);
      array _v{&_totalmode} _temporary_;
      array _datavec{&_dim0x} X1-X&_dim0x;
      array _X{&_dimension2, &_dimension1} _temporary_;
      set A&_mode0  end=eof;
      m1=mod(&_mode0+1, &_totalmode); if m1=0 then m1=&_totalmode;
      m2=mod(&_mode0+2, &_totalmode); if m2=0 then m2=&_totalmode;
      _v[m2]=mod(_n_, _dim[m2]); if _v[m2]=0 then _v[m2]=_dim[m2];
      _v[m1]=floor((_n_-_v[m2])/_dim[m2])+1;
      do j=1 to &_dim0;
         _v[&_mode0]=j;
         m1=mod(&_modex+1, &_totalmode); if m1=0 then m1=&_totalmode;
         m2=mod(&_modex+2, &_totalmode); if m2=0 then m2=&_totalmode;  
         _m=_v[&_modex];
         _n=(_v[m1]-1)*_dim[m2]+_v[m2];
         _X[_n, _m]=_datavec[j];
      end;
      if eof then do;
         do j=1 to &_dimension2;
            do k=1 to &_dimension1;
               _datavec[k]=_X[j, k];
            end;
            keep X1-X&_dim0x;
            output;
         end;
   end; 
run;

filename Acat  catalog "work.tensor.A&_modex..CATAMS";
data _null_;
     file Acat;
     mode="&_modex;";
     dimension="3,3,3;";
     put mode dimension;
run;

ods select none;
proc princomp data=A1  outstat=stat_1 noint  cov;
     var X1-X3;
run;
proc princomp data=A2  outstat=stat_2 noint  cov;
     var X1-X3;
run;
proc princomp data=A3  outstat=stat_3 noint  cov;
     var X1-X3;
run;
ods select all;

/***************************************
In order to obtain Tensor core S, we can 
do something like this:
S_(1)=t(U1)%*%A_(1)%*%(kronecker(U2, U3)
where S_(1) is 1-mode matrix of S and A_(1) 
is 1-mode matrix of A
****************************************/

%macro kroneck(mat_A, mat_B, mat_kronecker);
%local dsid row_B col_B  col_A;

%let dsid=%sysfunc(open(&mat_B));
%let row_B=%sysfunc(attrn(&dsid, nobs));
%let col_B=%sysfunc(attrn(&dsid, nvars));
%let dsid=%sysfunc(close(&dsid));
%let dsid=%sysfunc(open(&mat_A));
%let col_A=%sysfunc(attrn(&dsid, nvars));
%let dsid=%sysfunc(close(&dsid));

data _null_;
     col_K=&col_A*&col_B;
     call symput('col_K', compress(col_K));
run;
%put &col_A  &col_B  &col_K &row_B;

proc sql noprint;
     select name into :vars_A separated by ' '
  from   sashelp.vcolumn
  where  libname='WORK' and memname=%upcase("&mat_A")
  ;
     select name into :vars_B separated by ' '
  from   sashelp.vcolumn
  where  libname='WORK' and memname=%upcase("&mat_B")
  ;
quit;

data &mat_kronecker;
     array _K{&col_K} K1-K&col_K;
  array _A{&col_A} &vars_A;
  array _B{&col_B} &vars_B;
  array _Atmp{&col_A} _temporary_;
     array _Btmp{&col_B} _temporary_;

  do n1=1 to na;  
     set &mat_A  nobs=na point=n1;
  do j=1 to dim(_A);
           _Atmp[j]=_A[j];
  end;
     do n=1 to nb;
           set &mat_B nobs=nb point=n;
     do j=1 to dim(_B);
              _Btmp[j]=_B[j];
     end;
     do j=1 to &col_K;
        _ib=mod(j, &col_B); if _ib=0 then _ib=&col_B;
        _ia=floor((j-_ib)/&col_B)+1;
     *put _ia= _ib=  j= _A[_ia]= _B[_ib]=;
              _K[j]=_Atmp[_ia]*_Btmp[_ib];
     end;
     keep K1-K&col_K;
     output;
     end;
  end;
  stop;
run;

%mend kronecker;

options mprint  mlogic;
%kroneck(B, A, C);

data U1t(keep=X:);
     set stat_1;
     where _type_='USCORE';
run;
data U2t(keep=X:);
     set stat_2;
     where _type_='USCORE';
run;
data U3t(keep=X:);
     set stat_3;
     where _type_='USCORE';
run;

options nomprint  nomlogic;
%kroneck(U2t, U3t, U23t);

data A1score;
     set A1;
     retain _TYPE_ 'PARMS'; 
     _NAME_=compress('K'||_n_);
run;

proc score data=u1t  score=A1score out=u1A1(keep=K:) type=parms;
     var X1-X3;
run;
data u1A1;
     set u1A1;
     retain _TYPE_ 'PARMS'; 
     _NAME_=compress('X'||_n_);
run;

proc score data=u23t  score=u1A1  out=S1(keep=X:) type=parms;
     var K1-K9;
run;

Reference:
Matrix Methods in Data Mining and Pattern Recognition by Lars Eldén

Tuesday, December 01, 2009

Applying Delta Method in SAS

Delta method is a method in statistics for deriving an approximate probability distribution for a function of an asymptotically normally distributed random variable from knowledge of the limiting variance of that random variable.
See "http://en.wikipedia.org/wiki/Delta_method".

In order to apply delta method, it is necessary to obtain the first order derivatives of the function with respect to the randome variables, which may require either symbolic or finite difference approximation computing power. SAS doesn’t explicitly have symbolic computing power nor does it ouptut intermediate result from finite approximation computations and we have to turn to PROC MODEL in ETS (or PROC NLIN in SAS/STAT) to obtain the symbolic derivatives of a given function F(.). PROC MODEL output three pieces of related information:

1. Internal machine code for computing each part of the derivatives, say g(b). This is the table called CodeList;
2. Summary symbolic result for first order derivatives of f(b), i.e. the complete symbolic expression of g(b); This is the table called ProgList;
3. Indicator of partial derivatives, this is the table called Derivatives

The code list is the human-readable code for internal computing and I use this to guide further computing on the derivatives. I also use the ProgList in the first step to obtain the expression of first order derivatives of f(b), i.e. expression of g(b).

Suppose you have a translog model, called F(p1, p2, p3; b1, b2, b3), where p1, p2, p3 are input prices and b1, b2, b3 are parameter vectors (parameters should be more in real translog model). The elasticity of substitution of input 1 and input2 is defined as [@F()/@p1]/[@F()/@p2]=g(b1, b2). *NOTE: @ indicates derivative operator;

In my first step, I use PROC MODEL on F(p1, p2, p3; b1, b2, b3) to estimate parameter vector {b1, b2, b3}; In this step, we obtain tables CodeList_0 and ProgList_0;

In the second step, I tweak PROC MODEL to obtain the partial derivetives: @F()/@p1, and @F()/@p2. Then, I read in ProgList table to form the explicit AUES expression g(p1, p2| b1, b2, b3)= [@F()/@p1]/[@F()/@p2]; This step generates tables CodeList, ProgList;

Since delta method based approach to estimate V(g) relies on the first order derivative of function g() w.r.t parameter vector (b1, b2, b3), we also need to obtain the partial derivatives of g() with respect to each of the parameters:, ie. @g()/@b1, @g()/@b2, @g()/@b3. Therefore, I have to call PROC MODEL once again to obtain these formula. Tables CodeList_g and ProgList_g were generated in this step.

Later, I parse the instructions in CodeList_g to calculate @g()/@b1, @g()/@b2, @g()/@b3 at every data point of the sample. Once we obtain these values, we can use delta method to get the asymptotical mean and variance of g() function

V[g()]=[@g()/@b1, @g()/@b2, @g()/@b3]’ %*% V(b1, b2, b3) %*% [@g()/@b1, @g()/@b2, @g()/@b3]

In summary, my program only ask user to input the original model for estimation as well as the two input prices for cross elasticity calculation, users don’t need to specify the derivative functions themselves. If we use IML, we have to specify explicit formula for vector [@g()/@b1, @g()/@b2, @g()/@b3], which may very tedious to calculate.

Below is sample code using data from Econometrics Analysis of W.Greene, 6th Ed.


data Green;
input Year   Cost    K        L      E       M        Pk      Pl      Pe     Pm;
datalines;
1947  182.373 0.05107 0.24727 0.04253 0.65913 1.00000 1.00000 1.00000 1.00000
1948  183.161 0.05817 0.27716 0.05127 0.61340 1.00270 1.15457 1.30258 1.05525
1949  186.533 0.04602 0.25911 0.05075 0.64411 0.74371 1.15584 1.19663 1.06625
1950  221.710 0.04991 0.24794 0.04606 0.65609 0.92497 1.23535 1.12442 1.12430
1951  255.945 0.05039 0.25487 0.04482 0.64992 1.04877 1.33784 1.25179 1.21694
1952  264.699 0.04916 0.26655 0.04460 0.63969 0.99744 1.37949 1.27919 1.19961
1953  291.160 0.04728 0.26832 0.04369 0.64071 1.00653 1.43458 1.27505 1.19044
1954  274.457 0.05635 0.27167 0.04787 0.62411 1.08757 1.45362 1.30356 1.20612
1955  308.908 0.05258 0.26465 0.04517 0.63760 1.10315 1.51120 1.34277 1.23835
1956  328.286 0.04604 0.26880 0.04576 0.63940 0.99606 1.58186 1.37154 1.29336
1957  338.633 0.05033 0.27184 0.04820 0.62962 1.06321 1.64641 1.38010 1.30703
1958  323.318 0.06015 0.27283 0.04836 0.61886 1.15619 1.67389 1.39338 1.32699
1959  358.435 0.06185 0.27303 0.04563 0.61948 1.30758 1.73430 1.36756 1.30774
1960  366.251 0.05788 0.27738 0.04585 0.61889 1.25413 1.78280 1.38025 1.33946
1961  366.162 0.05903 0.27839 0.04640 0.61617 1.26328 1.81977 1.37630 1.34319
1962  390.668 0.05578 0.28280 0.04530 0.61613 1.26525 1.88531 1.37689 1.34745
1963  412.188 0.05601 0.27968 0.04470 0.61962 1.32294 1.93379 1.34737 1.33143
1964  433.768 0.05452 0.28343 0.04392 0.61814 1.32798 2.00998 1.38969 1.35197
1965  474.969 0.05467 0.27996 0.04114 0.62423 1.40659 2.05539 1.38635 1.37542
1966  521.291 0.05460 0.28363 0.04014 0.62163 1.45100 2.13441 1.40102 1.41878
1967  540.941 0.05443 0.28646 0.04074 0.61837 1.38617 2.20616 1.39197 1.42428
1968  585.447 0.05758 0.28883 0.03971 0.61388 1.49901 2.33869 1.43388 1.43481
1969  630.450 0.05410 0.29031 0.03963 0.61597 1.44957 2.46412 1.46481 1.53356
1970  623.466 0.05255 0.29755 0.04348 0.60642 1.32464 2.60532 1.45907 1.54758
1971  658.235 0.04675 0.28905 0.04479 0.61940 1.20177 2.76025 1.64689 1.54978
;
run;

data translog;
     set green;
  l_C=log(cost);
     lpk = log(pk); 
     lpl = log(pl); 
     lpe = log(pe); 
  lpm = log(pm);
run;

/* Specify the prices used in elasticity measurements */
%let covar1=lpk;
%let covar2=lpl;
/* Specify the translog cost function. It can be a nonlinear function, too */
%let model = %bquote(l_c=b0 + bk*lpk + bl*lpl + be*lpe + bm*lpm +
            bkk*0.5*lpk*lpk + bll*0.5*lpl*lpl +
      bee*0.5*lpe*lpe + bmm*0.5*lpm*lpm +
      bkl*lpk*lpl + bke*lpk*lpe + bkm*lpk*lpm +
      ble*lpl*lpe + blm*lpl*lpm + bem*lpe*lpm + exp(lpk););

/*@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@*/
/*                          do not change code below                    */
/*@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@*/
/* Estimate original model */
ods select none ;
ods output FirstDerivatives=Derivatives_0;
ods output CodeList=CodeList_0;
ods output ProgList=ProgList_0;
ods output ParameterEstimates=Est;
ods output CovB=CovB;
proc model data=Translog ListCode ListDer  covb ;
     *endogenous K L E; 
     *parms  b0 bk bl be bm
               bkk bll bee bmm
               bkl bke bkm
               ble blm
               bem;     
      &model;
      *K = bk + bkk*lpk + bkl*lpl + bke*lpe + bkm*lpm; 
      *L = bl + bll*lpk + bkl*lpk + ble*lpe + blm*lpm;
   *E = be + bee*lpe + bke*lpk + ble*lpl + bem*lpm;
      fit   l_c / outsused = smatrix  ols  outcov  outest=beta ; 
   *estimate '&PRED.K/@bkk' exp(lpk*bkk);  
   run;
quit;     
ods select all;


proc sql noprint;
     select wrt into :para_names separated by ' '
  from   derivatives_0
  ;
quit;
%put ¶_names;

data covb;
        length _TYPE_ $ 5;
        set covb;  
  _TYPE_='PARMS';
  rename Parameter=_NAME_;
run;
 

proc transpose data=Est out=Est_t name=Parameter;
     var Estimate;
  id Parameter;
run;

%let elastic_covars=lpk lpl;
data _new;
     set Translog(drop=&elastic_covars); set Est_t;  
run;


/* Obtain elasticity measurements */
ods select none ;
options error=0;
ods output FirstDerivatives=Derivatives;
ods output CodeList=CodeList;
ods output ProgList=ProgList;
proc model data=_new ListCode ListDer  maxiter=0;
      *endogenous Sk Sl Se; 
      &model;
      fit  l_c /  ols;    
   run;
quit;     
ods select all;

/* obtain analytical derivatives w.r.t input prices */

data _null_;
     length _temp_ $ 1024;
     set ProgList;
  if indexc(source, '@')>0 then do;
     _temp_='';
  x1=indexc(source, '/')+2; x2=indexc(source, '=')-1;
  name=substr(source, x1, x2-x1); put name=;
     _temp_=cats(_temp_, compress(substr(source, find(source, '=')+1), '; '));  
  j=_n_+1;  
     do while( indexc(source, ';')=0);
           set ProgList point=j ;
           _temp_=cats(_temp_, compress(substr(source, find(source, '=')+1), '; '));  
     j+1;
  end;
  call symput(compress('foc_'||name), _temp_);
      end;
   put _temp_=;
run;

%put %bquote((&&foc_&covar1)/(&&foc_&covar2));


/* Obtain derivatives of elasticity measurements w.r.t parameters */
data _new;
     set Translog;
     if _n_>1 then stop; 
run;
ods select none;
ods output FirstDerivatives=Derivatives_g;
ods output CodeList=CodeList_g;
ods output ProgList=ProgList_g;
proc model data=_new ListCode ListDer  maxiter=0;     
      *parms  b0 bk bl be bm
                bkk bll bee bmm
                bkl bke bkm
                ble blm
                bem;     
   nom=&&foc_&covar1;
   denom=&&foc_&covar2;
      l_c= nom / denom;
      fit  l_c / ols;    
   run;
quit;     
ods select all;
/*
proc sql noprint;
     select name into :para_names separated by ' '
  from   sashelp.vcolumn
  where  varnum>=2 
         &  libname='WORK' 
         &  memname='EST_T'
  ;
quit;
%put ¶_names;
*/

data _null_;
     set CodeList_g end=eof;
  if _n_=1 then do;
     call execute('data compute;');
  call execute('if _n_=1 then set Est_T;');
  call execute('set translog;');
  call execute('retain ¶_names ;');
  /*call execute('lpk = log(pk); lpl = log(pl); lpe = log(pe);' );*/

  declare hash h();
  length word $ 32 wordtrans $ 32;
  h.defineKey('word');
  h.defineData('word', 'wordtrans');
  h.defineDone();
  call missing(word, wordtrans);

  declare hiter hi('h');
  end;
  /* check if it is a Operation item */
  where index(col1, 'Oper')>0;
  if index(col1, 'eeocf')>0 then col1=tranwrd(col1, '=', 'eeocf');
     j0=index(col3, '@'); j1=indexc(col3, ':')+1; j2=indexc(col3, '<')+2;     
  s1=trim(substr(col3, 1, j1-2));
  s2=compress(substr(col3, j1, j2-j1-2));
  s3=trim(substr(col3, j2));

  s2B=translate(s2, '___', '/@.');
  *put s2=  s2B=;
  if indexc(s2, '@./')>0 then do;
     rc=h.find(key:s2);
     if rc=0 then h.replace(key:s2, data:s2, data:s2B);
     else h.add(key:s2, data:s2, data:s2B);
  end;

  j=1;
  do until (scan(trim(col3), j, ' ')='');
     w=subpad(scan(col3, j, ' '), 1, 32);  
        if h.find(key:w)=0 then do;
     word2=wordtrans;
     s3=tranwrd(s3, compress(w), compress(wordtrans));
     *put s3=  wordtrans=;
     *put '^^^^^^^^^';
  end;

  j+1;
  end;

  if s1 in ('*', '-', '+', '/', '**') then  s3=translate(trim(left(s3)), s1, ' '); 
  else if s1 not in ('eeocf', '=') then s3=compress(cats(s1, '(', s3, ')'));
  rc=h.find(key: subpad(s2, 1, 32));
  if rc=0 then  s3=cats(compress(wordtrans), '=', s3);   
  else s3=cats(s2, '=', s3);

  s3=tranwrd(s3, 'ACTUAL.', ' ');
  *put s3;  
  *put '________________';
  if index(s3, 'ERROR')=0 | index(s3, 'RESID')=0 then
     call execute(s3||';');
  if eof then do;
     call execute ('drop ¶_names;');
        call execute ('run;');
  h.output(dataset:'_test');
  end;
run;



data  Mean(drop=_PRED:)   FirstDerivatives(drop=PRED:);
        set compute(keep=Year PRED: _PRED:);  
run;

proc sql noprint;
        select cats(a.name, ' = ', substr(a.name, index(a.name, '__')+2)) into:renamestmt separated by ' '
  from  sashelp.vcolumn as a
  join  _test as b
    on  compress(a.name)=compress(b.wordtrans)
  where  a.libname='WORK' and a.memname='FIRSTDERIVATIVES'
  ;
quit;
%put &renamestmt;

proc sort data=derivatives_0; by wrt; run;
proc sort data=derivatives_g; by wrt; run;
data _dev;
     merge derivatives_0(keep=wrt in=_0) 
        derivatives_g(keep=wrt in=_g)  end=eof
  ;
  by wrt;
  length _temp_ $ 1024;
  retain _temp_ ;
  diff=(_0-_g); 
  if _n_=1 then _temp_=';';
  if diff=1 then _temp_=cats(_temp_, ';', wrt, '=0');
  put _temp_=;
  if eof then call symput ('impute', _temp_);
run;
%put %bquote(&impute);


data FirstDerivatives;
        set FirstDerivatives;
  rename &renamestmt;
  &impute;
  ;
run;

proc score data=firstderivatives  score=covb  type=parms  out=_temp1;        
        var ¶_names;
run;

data _temp1;
     set _temp1(drop=¶_names);  
run;
proc sql noprint;
     select cats(name, '=', translate(name, '', '2')) into :renamestmt separated by ' '
  from   sashelp.vcolumn
  where  libname='WORK' 
      &  memname='_TEMP1'
   &  varnum>1
   ;
quit;
%put &renamestmt;

data _temp1_score;
     set _temp1;
  rename &renamestmt;
  _TYPE_='parms';
  _NAME_=compress('v'||year);
run;

proc score data=firstderivatives  score=_temp1_score  type=parms  out=v_G(keep=Year v:);        
     by year;
     var ¶_names;
run;

data v_G;
     set v_G;
  _var_=sum(of v:, 0);
  drop v:;
run;

proc datasets library=work nolist;
     delete _temp1 _temp1_score  _new _test;
quit;

SAS Programming for Data Mining

Page Title