Thursday, December 18, 2014

%SVD macro with BY-Processing

For the Regularized Discriminant Analysis Cross Validation, we need to compute SVD for each pair of \((\lambda, \gamma)\), and the factorization result will be feed to the downdating algorithm to obtain leave one out variance-covariance matrix \(\hat{\Sigma}_{k\v}(\lambda, \gamma)\). The code is a direct modification of my original SVD macro @here. It is much more than simply put "BY-" statement in PROC PRINCOMP, but still easy to do. I used the within class CSSCP from the Iris data shipped with SAS and cross verified with R. %CallR macro is used here.

The screenshot below shows the SAS macro generates the same result as from R. Here is the test code.

dm log 'clear';
proc discrim data=sashelp.iris noclassify noprint
               outstat=iris_stat(WHERE=(_TYPE_ = "CSSCP" & Species ^= " "))
      ;
   class Species;
   var   SepalLength SepalWidth  PetalLength  PetalWidth;
run;
            
%svd_bystmt(sigma, output_v, output_s, output_u, &covars, Species, , nfac=0);

data _null_;
     file "d:\data\sigma.csv"  dlm=",";
  set iris_stat;
  if _n_=1 then do;
     put 'Species'      ',' 
            '_NAME_'       ',' 
            'SepalLength'  ',' 
            'SepalWidth'   ','   
            'PetalLength'  ','   
            'PetalWidth';
  end;
  put Species  _NAME_ SepalLength SepalWidth  PetalLength  PetalWidth;
run;

%RScript(d:\data\rscript.r)
cards4;
dat=read.csv("d:\\data\\sigma.csv", header=1, as.is=1)
head(dat)
sigma=dat[, c(-1, -2)]
sigmasvd=by(sigma, dat$Species, svd)
sigmasvd$Setosa$d
sigmasvd$Virginica$d
sigmasvd$Setosa$v
;;;;
run;

%CallR(d:/data/rscript.r, d:/data/rlog1.txt);

data _null_;
     infile "d:\data\rlog1.txt";
     input;
     put _infile_;
run;
data _null_;
     do until (eof1);
        set output_s  end=eof1;
     put @1 Species= @24 Number=  @40 Singuval=  8.2;
  end;
  put "             ";
  do until (eof2);
     set output_v  end=eof2;
  where Species="Setosa";
  put  Prin1  8.5 Prin2 8.5 Prin3 8.5 Prin4 8.5;
  end;
  stop;
run;








%macro SVD_bystmt(
           input_dsn,
           output_V,
           output_S,
           output_U,     
           input_vars,
    by_var,
           ID_var,
           nfac=0
           );

%local blank   para  EV  USCORE  n  pos  dsid nobs nstmt
       shownote  showsource  nbylevel  bystmt;

%let shownote=%sysfunc(getoption(NOTES));
%let showsource=%sysfunc(getoption(SOURCE));
options nonotes  nosource;

%let blank=%str( );
%let EV=EIGENVAL;
%let USCORE=USCORE;

%if &by_var eq &blank %then %do;
    %let bystmt = ␣
 %let nbylevel = 1;
%end;
%else %do;
    %let bystmt = by &by_var;
%end;

%let n=%sysfunc(countW(&input_vars));

%let dsid=%sysfunc(open(&input_dsn));
%let nobs=%sysfunc(attrn(&dsid, NOBS));
%let dsid=%sysfunc(close(&dsid));
%if  &nfac eq 0 %then %do;
     %let nstmt=␣ %let nfac=&n;
%end;     
%else %do;
     %let x=%sysfunc(notdigit(&nfac, 1)); 
  %if  &x eq 0 %then %do;
          %let nfac=%sysfunc(min(&nfac, &n));
          %let nstmt=%str(n=&nfac);
  %end;
  %else %do;
          %put ERROR: Only accept non-negative integer.;
          %goto exit;
  %end;
%end;

/* calculate U=XV/S */
%if &output_U ne %str() %then %do;
    %let outstmt=  out=&output_U.(keep=&by_var &ID_var  Prin:);
%end;
%else %do;
    %let outstmt=␣
%end;

/* build index for input data set */
proc datasets library=work nolist;
     modify &input_dsn;
  index create &by_var;
run;quit;

%let options=noint cov noprint  &nstmt;

proc princomp data=&input_dsn  
             /* out=&input_dsn._score */
              &outstmt
              outstat=&input_dsn._stat(where=(_type_ in ("&USCORE", "&EV")))  &options;
  &bystmt;
     var &input_vars;
run;

/* 
   need to check if the by_var is Char or Numeric, then set the
   format accordingly
*/
data _null_;
     set &input_dsn;
     type=vtype(&by_var);
  if type="C" then 
     call symput("bylevelfmtvar", "$bylevelfmt");
  else
     call symput("bylevelfmtvar", "bylevelfmt");
run;


data __bylevelfmt;
     set &input_dsn._stat  end=eof;
  &bystmt;
  retain _nbylevel 0;
  retain  fmtname "&bylevelfmtvar";
  if first.&by_var then do;
        _nbylevel+1;
  start=&by_var;  
  label=_nbylevel;
     output;
  keep label start fmtname  ;
  end;
  if eof then call symput("_nbylevel", _nbylevel);
run;
proc format cntlin=__bylevelfmt;
run;
%put &_nbylevel;


%if &output_V ne &blank %then %do;
 proc transpose data=&input_dsn._stat(where=(_TYPE_="&USCORE")) 
                out=&output_V.(rename=(_NAME_=variable))
                name=_NAME_;
   &bystmt;
      var &input_vars;
      id _NAME_;
      format &input_vars 8.6;
 run;
%end;

/* recompute Proportion */
%if &output_S ne %str() %then %do;
 data &output_S;
       set &input_dsn._stat ;
    &bystmt;
       where _TYPE_="EIGENVAL";
       array _s{*} &input_vars;
       array _x{&nfac, 3} _temporary_; 
    Total=sum(of &input_vars, 0);
    _t=0;
    do _i=1 to &nfac;
       _x[_i, 1]=_s[_i]; 
          _x[_i, 2]=_s[_i]/Total; 
       if _i=1 then 
             _x[_i, 3]=_x[_i, 2]; 
       else 
             _x[_i, 3]=_x[_i-1, 3]+_x[_i, 2];
       _t+sqrt(_x[_i, 2]);
    end;
    do _i=1 to &nfac;
       Number=_i;  
       EigenValue=_x[_i, 1]; Proportion=_x[_i, 2]; Cumulative=_x[_i, 3];
       S=sqrt(_x[_i, 2])/_t;  SinguVal=sqrt(_x[_i, 1] * &nobs/&_nbylevel);
       keep &by_var Number EigenValue  Proportion Cumulative  S SinguVal;
       output;
    end;
 run;
%end;

%if &output_U ne %str() %then %do; 
 data &output_U;
      array _S{&_nbylevel, &nfac}  _temporary_;  
      if _n_=1 then do;
         do until (eof);
            set  &output_S(keep=&by_var Number SinguVal)  end=eof;
      where Number<=&nfac;
      %if &by_var eq &blank %then %do;
          _row=1;
      %end;
      %else %do;
                   _row = input(put(&by_var, &bylevelfmtvar..), best.);
      %end;
            _S[_row, number]=SinguVal; 
            if abs(_S[_row, number]) < CONSTANT('MACEPS') then _S[_row, j]=CONSTANT('BIG');
         end;
     end;
     set &output_U;
     array _A{*}  Prin1-Prin&nfac;
     %if &by_var eq &blank %then %do;
        _row=1;
  %end;
        %else %do;
           _row = input(put(&by_var, &bylevelfmtvar..), best.);
  %end;
     do _j=1 to dim(_A);
         _A[_j]=_A[_j]/_S[_row, _j];
     end;
     keep &by_var &ID_var Prin1-Prin&nfac ;
 run;
%end;

proc datasets library=work nolist;
     modify &input_dsn;
  index delete &by_var;
run;quit;


%exit: 
options &shownote  &showsource;
%mend;

Monday, December 15, 2014

Experient downdating algorithm for Leave-One-Out CV in RDA

In this post, I want to demonstrate a piece of experiment code for downdating algorithm for Leave-One-Out (LOO) Cross Validation in Regularized Discriminant Analysis [1].

In LOO CV, the program needs to calculate the inverse of \(\hat{\Sigma}_{k\v}(\lambda, \gamma)\) for each observation v to obtain its new scoring function at a given pair of regularizing pair parameter \( (\lambda, \gamma) \), where \(k\v\) indicates class \(k\) excluding observation \(v\).

As mentioned in [1], to obtain the inverse of this variance covariance matrix for class k, it is not sufficient to just remove observation \(v\) and recalculate it, but instead, we try to calculate $$W_{k\v}(\lambda, \gamma)\hat{\Sigma}^{-1}_{k\\v}(\lambda, \gamma)$$, which is equivalent to compute the inverse of :

\[
  W_k(\lambda)\hat{\Sigma}^{-1}_k (\lambda) - \gamma \|Z_v\|^2  \cdot  I - (1-\gamma)Z_v Z_v^{t} \]

In this formula, the first element \(W_k(\lambda)\hat{\Sigma}^{-1}_k (\lambda) \) does not involve quantities varying along with observation \(v\) and can be pre-computed. The latter two can be easily updated on the fly as we scan through each individual observations.

With this in mind, we use the iris data for demonstration. Note that the SAS code here is for illustrating the concept of downdating algorithm above for a given pair of \( (\lambda, \gamma) \). The computing is divided into several steps. (1) We first use PROC DISCRIM to calculate CSSCP matrix, total weight and variable mean by class. These quantities correspond to \( \Sigma_k \), \( \bar{X_k}\) and \( W_k\) in the original paper. (2) We then use the %SVD Macro @here to obtain key elements for later updating.

Note that in order to obtain CSSCP, MEAN and total weights, in addition to PROC DISCRIM, we can also use PROC CORR. The advantage is that we can keep all computing within the framework of SAS/Base so to benefit budget-sensitive business and maximize compatibility. The down side is that the data set need to sorted or indexed to obtain CSSCP by class and the data has to be passed twice, one for pooled CSSCP and the other for class specific CSSCP.

(to be completed...)
 

/* study downdating algorithm */
%let lambda = 0.3;
%let gamma = 0.6;
%let target = Species;
%let covars = SepalLength SepalWidth  PetalLength  PetalWidth;

/* step 0. Obtain key statistics from DISCRIM */
proc discrim data=iris  outstat=stat noprint  noclassify;
      class ⌖
   var   &covars;
run;

/* step 1. Obtain \Sigma_k(\lambda) & \Sigma_k(\lambda, \gamma)
           \S_k = (_TYPE_="CSSCP' & &target = "something")
           \S  = (_TYPE_="CSSCP" & &target = " ")
           \W_k = (_TYPE_="N" & &target = "something") -1
           \W  = (_TYPE_="N" & &target = " ") - &nClass + 1
           \Sigma_k(\lambda) = \S_k(\lambda) / \W_k(\lambda)
              where:
                    \S_k(\lambda) = (1-\lambda)*S_k + \lambda*S
                    \W_k(\lambda) = (1-\lambda)*W_k + \lambda*W

          \Sigma_k(\lambda, \gamma) = (1-\lambda)*\Sigma_k(\lambda) + \gamma* trace[\Sigma_k(\lambda)]/p*I         
*/ 
proc sql noprint;
     select distinct &target into :targetvalues separated by " "
  from   stat
  where _TYPE_="CSSCP"
  ;
  select distinct _NAME_ into :covars separated by " "
  from   stat
  where _TYPE_="CSSCP"
  ;
quit;
%put &targetvalues;
%put &covars;

%let nClass=%sysfunc(countw("&targetvalues"));
%put &nClass;
%let nVars=%sysfunc(countw("&covars"));
%put &nVars;

data weights;
     set stat;
  where _TYPE_="N";
  array _nv{*} _numeric_;
  if &target="" then _total=_nv[1]-&nClass+1;
  retain lambda λ
  retain _total;
  weight=(1-&lambda) * (_nv[1]-1) + &lambda*_total;  
  keep &target   weight  lambda;
  if &target ^= "";
run;

proc sort data=stat  out=sigma;
      by _TYPE_  ⌖
   WHERE _TYPE_ = "CSSCP" & &target ^= " ";
run;
proc sort data=weights;
     by  ⌖
run;

/*-  step 1.1 Update \Sigma_k(\lambda) = ((1-\lambda)*S_k + \lambda*S) / ( (1-\lambda)*W_k + \lambda*W )  -*/
                                                                                          
data sigma;
     array _sigma{&nVars, &nVars} _temporary_;
  array _x{&nVars} &covars;
  array _y{&nClass} _temporary_;  
  if _n_=1 then do;
     _iter=1;
     do until (eof1);
        set stat(where=(_TYPE_="CSSCP" & &target = " "))  end=eof1;
     do _j=1 to &nVars;
        _sigma[_iter, _j]=_x[_j];
     end;
     _iter+1;
     end; 
        
     _iter=1; 
     do until (eof2);
     set weights end=eof2;
        _y[_iter]=weight;
     _iter+1;
  end;
  put _y[3]=;
  
  _target=⌖
  _iter=0;
  end;
  modify sigma ;  
  retain  weight;
  retain  _target;
  _TYPE_="COV";
 
  if &target^=_target then do;
        _iter+1;
        _i=0; 
  weight=_y[_iter];
  _target=⌖
  end;
  _i+1;
  put "&covars";
  put &covars;
  do _j=1 to &nVars;
     _x[_j]= ((1-&lambda)*_x[_j] + &lambda*_sigma[_i, _j])  ;
  end;
  put &covars;
  put "-----------------------------------------------------------------";
  replace;  
run;

/*- trace is sum of every elemnt of a matrix -*/
data _trace;
     set sigma;
  by ⌖
  array _x{*} &covars;
  retain trace _iter;
  if first.&target then do;
     _iter = 1;
  trace=0;
  end;
  trace+_x[_iter];
  _iter+1;
  if last.&target then output;
  keep &target  trace;
run;


/*- step 1.2 Update \Sigma_k (\lambda, \gamma) = (1-\gamma)*\Sigma_k(\lambda) + \gamma *c*I -*/
/*- where  c = trace(\Sigma_k(\lambda)) / p, where p=&nVars                             -*/
data sigma;
      if _n_=1 then do;
      declare hash h(dataset:'_trace');
   h.defineKey("&target");
   h.defineData("trace");
   h.defineDone(); 
   call missing(trace);
   end;
   set sigma; by ⌖
   array _x{*} &covars;
   retain _adj  trace _iter;
   if first.&target then do;
      _iter=1;
   end;
   if _iter=1 then do;
         _rc=h.find();
         _adj= &gamma/&nVars * trace;
   end;
   put _iter=  &target=;
   do _j=1 to &nVars;
      if _j=_iter then do;
      _x[_iter] = (1-&gamma)*_x[_iter] + _adj;
   end;
   else do;
            _x[_iter] = (1-&gamma)*_x[_iter];
   end;
   end;
   _iter=_iter+1;
   drop _type_   _adj _iter _j  _rc  trace;
run;



/*-step 1.3 Obtain the inverse of W_k \Sigma_k(\lambda, \gamma) - c*I  using SVD                                     -*/  
/*-This inverse is the same as first inverse \Sigma_k(\lambda, \gamma), then with S matrix deflated by W_k(\lambda)  -*/
/*-and differenced by c, where c=\gamma* |Z_v|^2/p, and Z_v = sqrt(bk(v))*(X_v- mean(X_k\v),                         -*/
/*-bk(v)=sk(v)*Wc(v) wv/(Wc(v)-wv)                                                                                   -*/
/*-v is the current observation                                                                                      -*/

/*-       1. apply SVD to obtain eigenvalues   
%SVD(sigma, out_u, out_S, out_V)                                                                                     -*/


/*-       2. The whole CV will be done in one Data Step
             Because all related quantitaties of the rest, such as such as |Z_v|, sk(v), Wc(v) 
             will require at least one pass through of the original data, therefore it is
             best to incorporate all computing in this step.
-*/
data _iris;
     array _m{*} &covars;
     array _mt{&nVars} _temporary_;
     if _n_=1 then do;
     _class=1;
        do until eof;
     set stat(where=(_TYPE_='MEAN'& &target^=""))  end=eof;
     do _j=1 to dim(_m);
        _mt[_class, _j]=_m[_j];
     end;
     _class+1;
  end;
  end;
  array _rank1{&nVars, &nVars} _temporary_;
  set iris point=1;  
  do _j=1 to dim(_m);
     _m[_j]=_m[_j]-_mt[1, _j];
  end;
  do _j=1 to &nVars;
     do _k=1 to &nVars;
           _rank[_j, _k]=_m[_j] * _[_k];

           /*  MORE CODE TO COME*/

     end;
  end;
run;

[1] J. H. Friedman. Regularized discriminant analysis. Journal of the American Statistical Association, 84(405):165–175, 1989.

Control Excel via SAS DDE & Python win32com

Excel is probably the most used interface between human and data. Whenever you are dealing with business people, Excel is the de facto means for all things about data processing.

I used to only use SAS and Python for number crunching but in one of my recent projects, I need to work out a way to send summarized data to Excel for charting, specifically charting in the Excel flavor where the style has particularity and clients need data to be associated with the chart, just like a copied Excel chart. It is extremely difficult to fine tune your SAS or Python figures to mimic the exact required Excel style, no to mention interactive data display on chart that comes handy with Excel, so I decided to study how to control Excel via SAS and Python.

Python is for standalone PC implementation while SAS runs on the analytic server, each serves different customer groups. In my situation, we have MS Office 2013 Professional installed on both client PC and the server. In Python, we use win32com to control Excel while in SAS, I use the old school Dynamic Data Exchange (DDE) . DDE is obsolete technology but I don't have much experience in using COM within SAS, nor did I found good examples in this aspect.

In this blog, I show how to generate the following Excel worksheets and copy the Charts with data to PowerPoint, in Python and in SAS.

For python scripting, I largely follow the posts from Dan's Python Excel blogs, and the Office IPython blog @here. You are welcome to check out his scripts there and develop your own. In order for win32com to work in your Python environment, it is required to install pywin32 package, which can be found on its project page @here . You may also want to create static dispatch use makepy.py from the project code and installs all generated constants from a type library in the object win32com.client.constants.

For SAS scripting, I largely follow SAS manual outlined above and some SAS user papers on DDE, especially using DDECMD. But since in PowerPoint 2013 macro recording is not longer available, the work will stop at Excel.

In the following, I will demonstrate, via comparing both languages, how to:
1. Invoke Excel Application
2. Reading data and generate contingency tables
3. Send the table data to Excel
4. Generate charts in Excel driving by SAS or Python
5. Copy Excel charts to PowerPoint with data embedded.



(to be completed .. )





Tuesday, July 01, 2014

%HPGLIMMIX SAS macro is available online at JSS website

My paper "%HPGLIMMIX: A High-Performance SAS Macro for GLMM Estimation" is now available at Journal of Statistical Software website @here.

SAS macro and code can also be found there. If you use it, please kindly send me an email so that I know my work is appreciated and is helpful to more people.
===========

Update:

The paper and the macro are well accepted: 400+ downloads for the paper and 90+ downloads for the macro for the first month it is online and I've got many good feedbacks. Several key points to make:

1. For binary dependent variable, you have to code your data into numeric {0, 1} and the macro will model the probability dependent variable having value 1. So if you have other requirements, you have to process the data before calling the macro.

2. This macro relies on PROC HPMIXED, so that it will generate two residual parameters by default, therefore for conditional distributions without scale parameters, such as Poisson, users need to hold the last residual variance to be 1 by using PARMS /HOLD= statement. Please see the explanation of second difference between this macro and PROC GLIMMIX on Page 9, as well as Example 2.

Friday, December 06, 2013

Market trend in advanced analytics for SAS, R and Python

Disclaimer: This study is a view on the market trend on demand of advanced analytics software and their adoptions from the job market perspective, and should not be read as a conclusive statement on what is all happening there. The findings should be combined with other third party research results such IDC reports http://www.idc.com/getdoc.jsp?containerId=241689 to reach a balanced and comprehensive idea.

/* --------------------------------------------------------------------------------------------------------------------*/
The debate on the competition between SAS, R and Python for advanced analytics, Statistics or Data Science in general never ends. The question boils down to how far SAS, as a software and programming language for such purpose, can still be used by business in mass, given its current pricing structure.

Some recent discussions can be found at:

http://www.listserv.uga.edu/cgi-bin/wa?A2=ind1312a&L=sas-l&F=&S=&P=1171

http://r4stats.com/2013/03/19/r-2012-growth-exceeds-sas-all-time-total/

http://r4stats.com/articles/popularity/

and more...

indeed.com has been used to address such question before by Robert Muenchen @ http://r4stats.com/articles/popularity/. His analysis on popularity of SAS and R in the job market using indeed.com was simple. The terms he used were just "R SAS" and "SAS !SATA !storage !firmware".

In my analysis using indeed trend, I directly search combination of languages such as R, SAS and Python with analytics related job terms, such as "R AND Analytics" or "R AND Regression", etc. The goal is to understand the market dynamics in adopting each of the three languages in advanced business analytics. I try to be as fair as possible in my analysis.

The job description related terms that I am going to combine with the language names fall into three categories:

1. Techniques typically used, here I use "Regression", which almost all advanced business analytics job will look for;
2. Industry, here I use "pharmaceutical", and I used "JPMorgan Chase" to represent financial service (even though it is not representative"
3. General term such as "data mining", "machine learning" and "statistics". In general, data mining has different implication from machine learning. The former is a more general term, or a modern term for "analytics" while the latter shows up momentum only recently and largely focus on computer science related field and more hardcore on algorithm development, etc, so favoring Python over the other two.

The graph below shows indeed.com job trends using search term "R and regression, SAS and regression, Python and regression". It has three immediate pieces of information to tell:
a.) Python has been picking up very fast since 09;
b.) The market share gap between SAS and R/Python is consistently dropping since 2006 and in foreseeable future the lead of SAS is going to disappear.
c.) SAS's market share reached high peak in 2011 but kept dropping thereafter while Python maintains steady trend and R is picking up in the same time period.




The below graph searches the trend using term "R and pharmaceutical", etc. I wanted to see how the trend of adopting those three languages in pharmaceutical industry. What we found out are three points:
a.) R and SAS almost show up at the same time in pharmaceutical jobs
b.) The relative job demand for analytics in pharmaceutical industry is declining.
c.) Python is on the rise even though still trailing far behind SAS and R.



When using "JPmorgan Chase" combined with languages for trend analysis, I specifically excluded "storage" from the SAS search term because the bank may hire some IT folks and result will mix with SAS storage system. The term is ' R and "jpmorgan chase", SAS not Storage and "jpmorgan chase", Python and "jpmorgan chase" '. There are several interesting observations:
a.) Python is picking up some job shares in JPmorgan Chase from 2010. R is losing;
b.) There is noticeable seasonality in hiring, mostly in summer time;
c.) Before 2008, there was strong demand for analysts using SAS, then the crisis came and hiring stalled for almost 2 years. Beginning in 2011, the business is getting better and the demand for analysts using SAS is picking up all year around and attained peak in this year.


In our last part of trend analysis, we want to combine the languages with more general terms such as "analytics", "data mining" to see how the whole business perceives these three analytical languages.

We first combine with "data mining". Three points need to be noted:
1. The job market for analytics in general has been increasing since 2009 and all languages shows good momentum from 2009 to 2010;
2. SAS was the dominant player in this business, but the landscape is changing rapidly. Both R and Python see almost identical adoption by the business. There are some minimal gap between these two but I would consider it not significant;
3. SAS is not as favored by the market as before and its share not only stalled but also showed some declining trend this year. This is very alarming.


Now, let's be more focused. First, we study "Statistics". The message is not good for SAS here. SAS used to be THE de facto software for industrial statistical analysis. This fact is reflected by the trend before 2010. SAS is so dominant that its market share is more than 3X comparing to R and even more comparing to Python. But what's interesting is that while SAS saw some good days from 2010 to 2011, which also reflected in their annual revenue", its market share is on a steady down road from 2012. On the other hand, R and Python are still picking up without hesitation.



Turning to "machine learning", Python is the leader, followed by R. SAS sees some distance here. The seasonality of Python is a little different from R and SAS while the latter two have very close seasonal dynamics. The hypothesis here is that Python is dominantly used in different industries while R and SAS share similar industries.

Friday, July 19, 2013

I don't always do regression, but when I do, I do it in SAS 9.4

There are several exciting add-ins from SAS Analytics products running on v9.4, especially the SAS/STAT high performance procedures, where "high performance" refers to either in single-machine multi-threading mode or full distributed mode.

 HPGENSELECT: high performance model fitting and variable selection for GLM within standard exponential family;

 HPLOGISTIC: high performance logistic regression

 HPLMIXED: high performance Linear Mixed Model, unfortunately it still strictly a LMM, so does not support the DIST= option in MODEL statement

 HPNLMOD: high performance nonlinear LS or MLE

 HPREG: high performance for OLS, luckily HPREG supports CLASS statement and includes LAR/LASSO variable selection methods. On the other hand, HPREG does not supports best-subsets variable selection methods.

 HPSPLIT: high-performance utility procedure that creates a decision tree model and saves results in output data sets and files for use in SAS Enterprise Miner.

Wednesday, May 08, 2013

Finding the closest pair in datat using PROC MODECLUS

 
 
UPDATE: Rick Wicklin kindly shared his visualization efforts on the output to put a more straightforward sense on the results. Thanks. Here is the code, run after my code below. Note that this is designed for K=2.
 

proc iml;
use out;      read all var {ID x y}; close out;
use neighbor; read all var {Nbor}; close neighbor;
NBor = num(Nbor);         /* convert to numeric */
xEnd = x[Nbor];
yEnd = y[Nbor];
create Viz var {ID x y xEnd yEnd}; append; close Viz;
quit;
proc sgplot data=Viz noautolegend;
        scatter y=y x=x / datalabel=id;
        vector x=xEnd y=yEnd / xorigin=x yOrigin=y;
run;

***************************************************************************



More often than not, the analyst wants to find the pair of observations that are the closest in terms of certain metrics, such as Euclidean distance. For example, on the popular SAS-L archive group, Randall Powers posted such a question at here, but he was using PROC DISTANCE.

His idea is straightforward: First calculate the pairwise distance among all observations, and Second, find the one that has the shortest distance. For the first step, he was trying to use PROC DISTANCE, then then he planned to use DATA STEP to search through. This is not workable in even the modest data size, say 100K observations, because PROC DISTANCE will generate a dense double precision 100K-by-100K matrix and will blow up your workstation. That is the reason Randall turns to SAS-L for help.

Well, SAS actually has an old PROC that can readily solve his problem. Of course, before a solution can be crafted, there are several issues that have to be clarified first.

First, what if some observations share the same closest observations, do you allow shared case or you need a mutually exclusive solution that each observation only has its unique non-shared counterpart as the closest point, then how would you determine which one should take the shared observation? First come first serve or any other rule?

Second, when we talk about pairs, we know N should be even number, but what if the data we have has an odd number for N? How to deal with the left over?

That being said, let's consider the simplest and most tolerate case, that is each observation is allowed to serve as the closest point to multiple other observations and N is an even number.

The solution uses PROC MODECLUS. This is an old procedure for density based clustering analysis, but its build-in nearest neighbor search algorithm and output capability make it the perfect candidate for this type of job. We use the example data in the "Getting Started" section.

The ODS OUTPUT statement directly outputs the list of nearest neighbors (closest points) for each observation and your have to specify either ALL or NEIGHBOR option in the PROC MODECLUS statement in order to use this functionality. In the same statement, we also specify K=2. K=2 means 2 nearest neighbor but since this applies for all observations, for each one, the nearest neighbor is K-1=1. So if you specify K=3, you actually ask this procedure find 2 closest point for each 1 observation at hand.

The good thing about the nearest neighbor list data set is that it also contains the calculated distance, therefore, in cases you need to deal with more complex situation as listed above, such as non-shared nearest neighbor, etc, you need some room for post processing. Here you can specify K=3 and PROC MODECLUS will output 2 neighbors for each observation, like below.


You can see that observation 3 can be matched to either observation 1 or 2, but with 2, it yields the shortest distance, and if this is the rule you are going to apply, your post process can work on this data set to implement the rule.

Please feel free to verify the result. If you find error, let me know or post in the comment.

 
data example;
      input x y @@;
   ID=_n_;
      datalines;
   
   18 18  20 22  21 20  12 23  17 12  23 25  25 20  16 27
   20 13  28 22  80 20  75 19  77 23  81 26  55 21  64 24
   72 26  70 35  75 30  78 42  18 52  27 57  41 61  48 64
   59 72  69 72  80 80  31 53  51 69  72 81
   ;
run;

ods select none;
ods output neighbor=neighbor; /* ODS output dataset */
proc modeclus data=example method=0  k=2    /*Find a pair of nearest neighbor*/
                 out=out  
                 all  /* required option in order to output Nearest Neighbors*/;
     var x y;
  id  id;
run;
ods select all;


Tuesday, March 26, 2013

Large Scale Linear Mixed Model

Update at the end:
****************************;
Bob at r4stats.com claimed that a linear mixed model with over 5 million observations and 2 million levels of random effects was fit using lme4 package in R:


I am always interested in large scale mixed model like this and would appreciate anyone who can point me to the example Bob quoted (update: Bob responded in comment, please check it out). I left a reply to Bob on his blog but no response yet. I also tried to Google this example but all examples returned are on a scale with hundreds up to 5 thousand of levels of random effects, which is very common and reasonable. While I don't have the data used in the quoted example, I decided to see how long it takes for a mixed model with similar scale to be fitted in SAS's PROC HPMIXED. Therefore I borrowed Tianlin Wang's simulated data example in his paper: All the Cows in Canada: Massive Mixed Modeling with the HPMIXED procedure in SAS 9.2. But I modified the code a little bit so that there will be 400K cows to be included in the random effect and I used SUBJECT= option to accelerate the computing with by-subject processing.


%let NFarm = 1000;
%let NAnimal = %eval(&NFarm*400);
%let Va = 4.0;
%let Ve = 8.0;
data Sim;
    keep Species Farm Animal Yield;
    array BV{&NAnimal};
    array AnimalSpecies{&NAnimal};
    array AnimalFarm{&NAnimal};
    do i = 1 to &NAnimal;
       BV {i} = sqrt(&Va)*rannor(12345);
       AnimalSpecies{i} = 1 + int( 5 *ranuni(12345));
       AnimalFarm {i} = 1 + int(&NFarm*ranuni(12345));
    end;
    do i = 1 to 40*&NAnimal;
       Animal = 1 + int(&NAnimal*ranuni(12345));
       Species = AnimalSpecies{Animal};
       Farm = AnimalFarm {Animal};
       Yield = 1 + Species
                 + Farm
                 + BV{Animal}
                 + sqrt(&Ve)*rannor(12345);
       output;
    end;
run;

proc sort data=sim; by animal species farm; run;
options fullstimer;
ods listing close;
ods select none;
proc hpmixed data=Sim;
     class Species Farm Animal;
     model Yield = Species Farm*Species /ddfm=residual;
     random animal /cl ;
     ods output SolutionR=EBV;
run;
ods listing;


For this particular model, it took more than 8 hours to be fitted on a desktop with i5-3570K CPU running at 3.8Ghz. I certainly dare not to test against a case with 2 million levels of random effects and would assume it will be a LONG LONG TIME. Note that with the original 10K levels of random effects, it took only 20 seconds to be fitted on the same machine, and I do know the time consumption is more than super-linear in the number of random effects' levels with all the computing optimization in HPMIXED.


lme4 package supports Sparse matrix techniques only on the random effect design matrix, but HPMIXED procedure supports Sparse matrix techniques to the full Mixed Model Equation, and in many applications, will perform much better:

Take the above example, but using only 150 Farms and assuming 100 Animals within a Farm, we fit the same mixed model using lme4 package and the HPMIXED procedure. The lmer function is called within SAS using the %RScript and %CallR macros (see here).

Left panel shows the status of the machine running lmer function on the mixed model and the right panel shows the status when the machine ran HPMIXED. Because lme4 does not support sparse matrix techniques for fixed effect design matrix, it takes significant penalty on memory usage (12+GB vs 25MB) and computing time (244sec real time vs. 17sec real time). In a later test using only random effects, HPMIXED used 5.7 seconds while lmer used 11.3 second.

Left: Running R; Right: Running SAS