Thursday, October 06, 2011

Bayesian Computation (3)

In Chapter 3 of "Bayesian Computation with R", Jim Albert discussed how to carry out two fundamental tasks of statistics, namely estimation and hypothesis testing, in a single-parameter framework.

The structure of this chapter is organized as the following:

                  +--------Non-informative Prior: estimate variance parameter in a normal model
                  |
                  |
+------Estimation +--------Informative Prior : estimate a gamma model
|                 |
|                 |
|                 +--------Sensitivity to the choice of Prior : Robustness
|
|
+-----  Hypothesis Testing : A binomial test


/*** 3.2 ***/
/* Read the tab-delimited football scores file (favorite, underdog,
   spread columns are used below) into WORK.FOOTBALL, replacing any
   existing copy. */
proc import datafile="c:\footballscores.txt"  out=football
            dbms=tab  replace;
run;

/* Compute d = (favorite - underdog) - spread, the game outcome relative
   to the point spread, and its square d2; store the observation count
   in macro variable N.
   FIX: CALL SYMPUTX (instead of SYMPUT) trims the leading blanks that
   the numeric-to-character conversion of _N_ would otherwise embed
   in &N. */
data football;
     set  football  end=eof;
     d=favorite - underdog -spread;
     d2=d**2;
     if eof then call symputx('n', _n_);
run;

/* Sum the squared deviations d2 into dataset SUM, then store the total
   in macro variable V.
   FIX: CALL SYMPUTX (instead of SYMPUT) avoids embedding the leading
   blanks of the BEST12. numeric-to-character conversion in &V. */
proc means data=football  noprint;
     var  d2;
     output  out=sum  sum(d2)=v;
run;
data _null_;
     set sum;
     call  symputx('v', v);
run;


%put &n;
%put &v;


/* Simulate 1000 draws from the posterior of the normal standard
   deviation: precision p ~ ChiSquare(n)/v, then s = sqrt(1/p). */
data P;
     drop i;
     call streaminit(100000);
     do i = 1 to 1000;
        p = rand('CHISQUARE', &n) / &v;
        s = sqrt(1/p);
        output;
     end;
run;


/* Bin the posterior draws of s; ODS SELECT NONE suppresses printed
   output because we only want the OUTHISTOGRAM= dataset of bin
   midpoints and counts for re-plotting. */
ods select none;
proc univariate data=p  ;
     var s;
     histogram /midpoints=12.5 to 15.5 by 0.25                      
                cbarline=blue  cfill=white   
                outhistogram=hist;
     title "Histogram of ps";
run;
title;
ods select all;

/* Re-draw the binned counts from the OUTHISTOGRAM= dataset as a bar chart. */
axis1 label=(angle=90 "Frequency");
proc gbarline data=hist;
     bar _midpt_/discrete  sumvar=_count_  space=0  axis=axis1 ;
     title " ";
run;
title;

/* 3.3 */
/* Predictive density of y events under exposure ex with a
   Gamma(alpha, beta) prior on the rate:
   f(y) = f(y | lam) * g(lam) / g(lam | y), evaluated at lam = alpha/beta,
   rounded to three decimals.  Output: y and py only. */
data y;
     retain alpha 16  beta 15174  yobs 1  ex 66;
     keep y py;
     lam   = alpha / beta;
     scale = 1 / beta;
     do y = 0 to 10;
        f_lik   = pdf('POIS',  y,   lam * ex);
        g_prior = pdf('GAMMA', lam, alpha, scale);
        g_post  = pdf('GAMMA', lam, alpha + y, 1/(beta + ex));
        py = round(f_lik * g_prior / g_post, 0.001);
        output;
     end;
run;


/* Draw 1000 samples from the Gamma(alpha + yobs, scale = 1/(beta + ex))
   posterior of the rate parameter; only the draws (RANNUM) are kept. */
data lambdaA;
     retain alpha 16  yobs 1  beta 15174  ex 66  seed 89878765;
     keep rannum;
     shape = alpha + yobs;
     scale = 1 / (beta + ex);
     do draw = 1 to 1000;
        rannum = rangam(seed, shape) * scale;
        output;
     end;
run;       


/* Bayes factor computation for H0: p = 0.5 given y successes in n
   trials, with a Beta(a, a) prior under the alternative.  Prints the
   marginal likelihood ratio m1 and lambda = Prob(coin is fair). */
%let n=20;
%let y=5;
%let a=10;
%let p=0.5;

data _null_;
     /* binomial likelihood at the null value p, used twice below */
     like0 = pdf('binomial', &y, &p, &n);
     m1 = like0 * pdf('beta', &p, &a, &a)
                / pdf('beta', &p, &a+&y, &a+&n-&y);
     put m1=;
     lambda = like0 / (like0 + m1);
     put lambda=;
run;


/* Sensitivity of lambda = Prob(coin is fair) to the Beta(a, a) prior:
   evaluate lambda over log(a) in [-4, 5] on a 100-step grid.
   IMPROVEMENT: the binomial likelihood at p does not depend on a, so
   it is hoisted out of the loop instead of being recomputed three
   times per iteration. */
data lambda;
     interval = (5 - (-4)) / 100;
     like0 = pdf('binomial', &y, &p, &n);   /* loop-invariant */
     do loga = -4 to 5 by interval;
        a  = exp(loga);
        m1 = like0 * pdf('beta', &p, a, a)
                   / pdf('beta', &p, a+&y, a+&n-&y);
        lambda = like0 / (like0 + m1);
        output;
     end;
run;

/* Plot Prob(coin is fair) against log(a); INTERPOL=J joins the grid points. */
symbol interpol=j;
axis1  label=(angle=90  "Prob(coin is fair)");
axis2  label=("log(a)");
proc gplot data=lambda;
     plot lambda*loga /vaxis=axis1 haxis=axis2;
run;quit;
goptions reset=all;



/* Robustness of Bayesian Method 
   In this example, Jim tries to estimate the IQ of Joe based on two 
   different prior distributions, Normal and T. The posterior distribution 
   of the estimate will be compared to demonstrate the robustness of 
   Bayesian method.
*/

/* Normal prior N(mu, tau^2) for Joe's IQ: for each observed sample
   mean ybar (n = 4 observations, sigma = 15), compute the posterior
   mean MU1 and posterior standard deviation TAU1 by precision
   weighting. */
data summ1;
      retain mu 100  tau 12.16  sigma 15  n 4;
      keep ybar mu1 tau1;
      input ybar;
      se   = sigma/sqrt(n);
      tau1 = 1/sqrt(1/se**2 + 1/tau**2);
      mu1  = (ybar/se**2 + mu/tau**2)*(tau1**2);
cards;
110
125
140
;
run; 

/* Compare the two candidate prior densities for theta (Joe's IQ):
   a scaled t(2) prior and a N(mu, tau^2) prior, on a grid over [60, 140].
   BUG FIX: mu and tau were never defined in this step (each DATA step
   has its own program data vector, so values from SUMM1 do not carry
   over), making every density missing; they are now set to the same
   mu=100, tau=12.16 used in SUMM1.
   Also, the normal density was scaled by 1/10 rather than the correct
   1/tau normalization; fixed below. */
data theta;
      retain mu 100  tau 12.16;
      keep theta tdensity ndensity;
      interval = (140-60)/200;
      /* t scale chosen so the t(2) prior matches the normal prior's
         90% central interval */
      tscale = 20/quantile('T', 0.95, 2);
      do theta = 60 to 140 by interval;
         tdensity = 1/tscale * pdf('T', (theta-mu)/tscale, 2);
         ndensity = 1/tau    * pdf('NORMAL', (theta-mu)/tau);
         output;
      end;
run;

/* Unnormalized posterior of theta under the t(2) prior, for each
   observed ybar, on a grid over [60, 180]; the likelihood uses
   se = sigma/sqrt(n) = 15/2 = 7.5.
   BUG FIX: mu was never defined in this step, so PRIOR and POST were
   missing on every row; set mu=100 to match the prior center used
   throughout this example. */
data summ2;
      retain mu 100;
      tscale = 20/quantile('T', 0.95, 2);
      input ybar;
      do theta = 60 to 180 by (180-60)/500;
         like  = pdf('NORMAL', (theta-ybar)/7.5);
         prior = pdf('T', (theta-mu)/tscale, 2);
         post  = prior*like;
         output;
      end;
cards;
110
125
140
;
run;

/* Normalize POST within each ybar group so the discretized posterior
   sums to 1 (METHOD=SUM divides each value by its group total). */
proc stdize data=summ2  method=sum  out=summ2;
      by ybar;
      var post;
run;

/* Posterior mean and sd of theta for each ybar from the normalized
   grid: mean m = sum(theta*post); sd = sqrt(sum(theta^2*post) - m^2).
   NOTE: SUMM2 is deliberately overwritten twice below, so the order
   of these three steps matters. */
data summ2v;
      set summ2;
      m=theta*post;
   s=theta**2*post;
run;
proc means data=summ2v  noprint;
      by ybar;
   var m s;
   output out=summ2(keep=ybar  m  s)  sum(m)=m  sum(s)=s;
run;
/* convert the raw second moment into a standard deviation */
data summ2;
      set summ2;
   s=sqrt(s-m**2);
run;

/* Combine the normal-prior (SUMM1) and t-prior (SUMM2) posterior
   summaries side by side.
   FIX: match the rows explicitly on ybar rather than by position;
   both inputs are already in ybar order, so BY-group merging is safe
   and protects against future reordering. */
data summ;
      merge summ1  summ2;
      by ybar;
run;


/* Hypothesis Testing 
   In this example
*/
/* Two-sided p-value for y=5 successes out of n=20 under p0=0.5.
   BUG FIX: CDF('BINOMIAL', m, p, n) takes the success probability as
   the SECOND argument; the original call cdf('BINOMIAL', 5, 20, 0.5)
   passed 20 where a probability in [0,1] is required. */
data _null_;
     pvalue=2*cdf('BINOMIAL', 5, 0.5, 20);
  put pvalue= ;
run;

/* Bayes factor test of H0: p = 0.5 (retained-variable version of the
   macro-variable computation in section 3.3).
   BUG FIX: PDF('BINOMIAL', m, p, n) expects the probability before the
   sample size; the original pdf('BINOMIAL', y, n, p) passed n=20 where
   a probability is required, so every result was invalid. */
data _null_;
     retain n 20
            y  5
            a 10
            p  0.5
     ;
  m1=pdf('BINOMIAL', y, p, n) * pdf('BETA', p, a, a) / pdf('BETA', p, a+y, a+n-y);
  lambda = pdf('BINOMIAL', y, p, n)/(pdf('BINOMIAL', y, p, n) + m1);
  put lambda= best.;
run;


/* SAS rendition of LearnBayes' pbetat(): Bayes factor and posterior
   probability of H0: p = p0, with prior Prob(H0) = prob and a
   Beta(a, b) prior under the alternative, given s successes in n
   trials.  Output: one row per statistic (_STAT_, NVALUE). */
data pbetat_out;
     length _stat_ $ 12;
     keep _stat_ nvalue;
     input p0  prob  a  b  s  n;
     f = n - s;
     /* log Bayes factor, assembled on the log scale for stability */
     lbf  = s*log(p0) + f*log(1-p0) + logbeta(a, b) - logbeta(a+s, b+f);
     bf   = exp(lbf);
     post = prob*bf / (prob*bf + 1 - prob);
     _stat_='BayesFactor'; nvalue=bf;   output;
     _stat_='Posterior';   nvalue=post; output;
cards;
0.5  0.5 10 10 5 20
;
run;
 
     
/* Prob(coin is fair) as a function of log(a) for the Beta(a, a)
   prior, traced over log(a) in [-4, 5] in 100 steps; output LOGA,
   the marginal ratio M2, and LAMBDA for plotting. */
data lambda;
      retain n 20  y 5  p 0.5  a 10;
      keep loga m2 lambda;
      do loga = -4 to 5 by (5+4)/100;
         a  = exp(loga);
         _x = pdf('BINOMIAL', y, p, n);
         m2 = _x * pdf('BETA', p, a, a) / pdf('BETA', p, a+y, a+n-y);
         lambda = _x / (_x + m2);
         output;
      end;
run;

/* SG-procedure version of the sensitivity plot drawn with PROC GPLOT earlier. */
proc sgplot data=lambda;
      series x=loga  y=lambda;
   label  lambda='Prob(coin is fair)'
          loga='log(a)'
    ;
   format lambda 7.1  loga 7.0;
run;

Obtain Trace of the Projection Matrix in a Linear Regression

Recently, I am working on coding in SAS for a set of regularized regressions and need to compute trace of the projection matrix:
$$ S=X(X'X + \lambda I)^{-1}X' $$.

Wikipedia has a well written introduction to Trace @ here.

To obtain the inverse of matrix (X'X + \lambda I) in SAS/STAT, there are multiple ways:
1. Build the SSCP matrix first, then inverse it using the following methods:
    1.1. Using SVD based method, shown @ here but not demonstrated in this post;
    1.2. Using PROC REG, shown @ here , and also demonstrated in the code below;
    1.3. Using SWEEP operator, shown @ here under Method 0 and Method 1;
2. Since (X'X + \lambda I) is the SSCP for a Ridge Regression with ridge parameter=\lambda, it is handy to directly use the ODS OUTPUT InvXPX= statement to obtain the inversed matrix, when X is appended by a diagonal matrix of \lambda*I_{p, p}. SAS code is demonstrated below;

Trace of the projection matrix S is a key concept in modern regression analysis. For example, the effective degrees of freedom of the model in a regularized linear regression is trace(S); see [1] for details. For another example, in approximating leave-one-out cross-validation using GCV, trace(S)/N is a key component (formula 7.52 of [1]).

Check the reference and the cited works therein for more information.

Ref:
1. Hastie et al., The Elements of Statistical Learning, 2nd Ed.


/* Use the SocEcon example data created in
   Example A.1: A TYPE=CORR Data Set Produced by PROC CORR
   on page 8153 of SAS/STAT 9.22 User Doc
*/
/* Twelve census tracts with five socio-economic measures; HOUSE serves
   as the dependent variable in the regression examples below. */
data SocEcon;
input Pop School Employ Services House;
datalines;
5700 12.8 2500 270 25000
1000 10.9 600 10 10000
3400 8.8 1000 10 9000
3800 13.6 1700 140 25000
4000 12.8 1600 140 25000
8200 8.3 2600 60 12000
1200 11.4 400 10 16000
9100 11.5 3300 60 14000
9900 12.5 3400 180 18000
9600 13.7 3600 390 25000
9600 9.6 3300 80 12000
9400 11.4 4000 100 13000
;
run;

/* Analysis parameters: response, covariate list, and ridge parameter. */
%let depvar=HOUSE;
%let covars=pop school employ services;
%let lambda=1;
/* Below is the way to obtain trace(S), where S is the projection matrix in a (regularized) linear regression. 
   For further information, check pp.68, pp.153 of Elements of Statistical Learning, 2nd Ed.
*/

/* For details about TYPE=SSCP special SAS data, consult:
  Appendix A: Special SAS Data Sets, pp.8159 of SAS/STAT 9.22 User's Guide
*/
/* Build X'X (with the Intercept row/column PROC CORR adds) as a TYPE=SSCP dataset. */
proc corr data=SocEcon sscp out=xtx(where=(_TYPE_='SSCP'))  noprint;
     var &covars;
run;


/* Augment the SSCP: append the columns of lambda*I (i1-i5) and add
   lambda to the diagonal of X'X, yielding (X'X + lambda*I) with the
   identity columns alongside, ready for the PROC REG inversion trick.
   BUG FIX: the diagonal increment was hard-coded as +1; it agreed with
   the i1-i5 columns only because &lambda happened to equal 1.  Use
   &lambda so the step honors the macro parameter. */
data xtx2;
     set xtx;
  array _n{*} _numeric_;
  array _i{*} i1-i5 (5*0);
  /* row _N_ of lambda*I: lambda on the diagonal, zero elsewhere */
  do j=1 to 5;
     if j=_n_ then _i[_n_]=&lambda;
  else _i[j]=0;
  end;
  _n[_n_]=_n[_n_]+&lambda;
  drop j _TYPE_  _NAME_;
run;

/* Obtain the inverse of (XTX+\lambda*I)
   Note that we explicitly specified Intercept term in the 
   covariate list and fit a model without implicit intercept 
   term in the model.
*/
/* Regressing each identity column i1-i5 on the augmented SSCP makes
   OUTEST= (declared TYPE=SSCP) hold the columns of
   (X'X + lambda*I)^{-1}. */
proc reg data=xtx2  
         outest=S0(type=SSCP
                   drop=i1-i5 _MODEL_  _DEPVAR_  _RMSE_)
         singular=1E-17;
     model i1-i5 = Intercept &covars / noint   noprint;
run;quit;

/* Rename the coefficient rows to X_1..X_5 so PROC SCORE below emits
   one scored column per row of the inverse. */
data S0;
     set S0; 
  length _NAME_ $8;
  _NAME_=cats('X_', _n_);
run;

/* Score the raw data against S0: output columns X_: hold
   X * (X'X + lambda*I)^{-1}, one scored column per row of the inverse. */
proc score data=SocEcon  score=S0  out=XS0(keep=X_:)  type=parms;
     var &covars;
run;     

/* Diagonal entries of S = X (X'X + lambda*I)^{-1} X': multiply each
   scored row elementwise by the corresponding design row and sum
   across columns; the grand total of ROWSUM is trace(S).
   NOTE(review): dataset X (the design matrix with an INTERCEPT column)
   is not created anywhere in this post -- presumably it is SocEcon
   augmented with Intercept=1; confirm before running. */
data XS0X;
     merge XS0  X;
  array _x1{*} X_:;
  array _x0{*} intercept pop school employ services;
  do i=1 to dim(_X1);
     _x1[i]=_x1[i]*_x0[i];
  end;
  rowSum=sum(of _x1[*]);
  keep rowSum;
run;
/* trace(S) = sum of the per-row diagonal contributions */
proc means data=XS0X  noprint;
     var rowSum;
  output out=trace  sum(rowSum)=Trace;
run;


Verify the result using R:


> socecon<-read.csv('c:/socecon.csv', header=T)
> x<-as.matrix(cbind(1, socecon[,-5]))
> xtx<-t(x)%*%x
> phi<-xtx+diag(rep(1, 5))
> 
> # method 1. 
> S<-x%*%solve(phi)*x
> sum(S)
[1] 4.077865
> # method 2. 
> S<-(x%*%solve(phi))%*%t(x)
> sum(diag(S))
[1] 4.077865
>

Besides method 1.2 shown above, we can also use Method 2 mentioned earlier and obtain the same inverse matrix:


/* Append the p rows of lambda*I beneath the design matrix so the SSCP
   of the stacked matrix equals X'X + lambda*I (Method 2).
   NOTE(review): SET X refers to a dataset not created in this post --
   presumably the design data (SocEcon); confirm before running. */
data SocEcon2 /view=SocEcon2;
     set x  end=eof;
  array x{5} Intercept &covars (5*0);
  Intercept=1;
  output;
  if eof then do;
     /* emit one row of lambda*I at a time: set element j to lambda,
        output, then reset it to 0 for the next row */
     do j=1 to dim(x);
     x[j]=&lambda;
     output;
     drop j;
     x[j]=0;
  end;
  end;
run;

/* The I option prints the inverse of the crossproducts matrix; ODS
   OUTPUT InvXPX= captures it as dataset S1 -- here it equals
   (X'X + lambda*I)^{-1} thanks to the appended lambda*I rows.
   NOTE(review): variable Y does not exist in SocEcon (the response is
   House / &depvar); the inverse is produced regardless of the response,
   but confirm the intended MODEL statement. */
ods select none;
ods output InvXPX=S1;
proc reg data=SocEcon2  singular=1E-17;
     model y = Intercept &covars /noint  i;
run; quit;
ods select all;


As a side point, it is also of interests to compare the computational performance of both illustrated methods. SVD-based approach and SWEEP operator based approach are omitted.

Using a SAS data set of 1001 covariates (including Intercept term) and 1E6 observations, total 7.6GB on a windows PC, the test was done on two dramatically different machines: one WindowsXP laptop with mediocre HDD and a 2-core T7300 CPU; the other one is a high-end Server running Linux64 with Disk Array and fast CPUs totaled 16 cores.

On the PC:
-> Using method 1.2, the time used decomposition is listed below:
PROC CORR:  real time: 25:47.72; CPU time: 15:26.15
DATA step on XTX: real time: 2.28 sec; CPU time: 0.07 sec
PROC REG :  real time: 15.45 sec; CPU time: 27.31 sec;
Total real time: 26:05.45

-> Using method  2, the time used is:
PROC REG: real time: 48:28.44; CPU time: 1:16:46.41

On the server:
 -> Using method 1.2, the time used decomposition is listed below:
PROC CORR: real time: 5:40.61; CPU time: 5:40.58
DATA step on XTX: real time: 0.05 sec; CPU time: 0.05 sec
PROC REG: real time 1.71 sec; CPU time: 5.27 sec
Total real time: 5:42.37

-> Using method 2, the time used is:
PROC REG: real time: 6:01.46; CPU time: 19:13.49

The performance is summarized below: