Thursday, June 04, 2009

Randomly Split SAS table exactly according to a given probability vector








*Given probability vector, this program follows the principle of Algorithm S (3.4.2) of Knuth [1]
*Ensure randomness within any level of value combination of control variables
*Only need to pass through data once in spliting step

[1] Don Knuth, "Art of Programming", Algorithm S, section 3.4.2


/*************************************************/
%macro split(
                     seed,           /*seed for random number generator                       */
                     probvector,   /*macro variable of spliting prob vector separated by "  " */
                     control_vars, /*control variable as in PROC SURVEYSELECT                 */
                     in_dsn,         /*input data set                                         */
                     out_dsn=_out_dsn_, 
                     sort=nest
                     );

%local blank  n  nv  probs  _k  idxname  nobs  dsid   seed;
%let blank=%str( ); 
%let n=1; %let nv=1;
%let probs=&probvector;

%let dsid=%sysfunc(open(&in_dsn));
%if  (&dsid=0) %then %do;
     %put %sysfunc(sysmsg());
     %put Stop Processing due to Errors;
     %goto exit;
%end;
%else %do;
    %let nobs=%sysfunc(attrn(&dsid, NLOBS));
    %let dsid=%sysfunc(close(&dsid));    
%end;

%if &out_dsn=&blank %then %let &out_dsn=&in_dsn;
%if &seed=&blank %then %let seed=0;

%let pos=%scan(&probs,&n,&blank); 
%do %while(&pos ne &blank); 
    %let n=%eval(&n+1); 
    %let pos=%scan(&probs,&n,&blank); 
%end;
%let n=%eval(&n-1);

%let pos=%scan(&control_vars,&nv,&blank);
%let control_vars_x=&pos;
%let control_key=%str(&pos);
%let control_find=%str(key:&pos); 
%do %while(&pos ne &blank); 
    %put &pos;
    %let nv=%eval(&nv+1); 
    %let pos=%scan(&control_vars,&nv,&blank); 
    %if &pos^=&blank %then %do;
        %let control_vars_x=%str(&control_vars_x * &pos);
        %let control_key=%str(&pos%", %"&control_key);
        %let control_find=%str(key:&pos, &control_find);    
    %end;
%end;
%let nv=%eval(&nv-1);
%put nv=&nv;
%if %eval(&nv>0) %then %do;
    %if %upcase(&sort)=NEST %then %do;
        proc sort data=&in_dsn out=_in_dsn_; 
             by &control_vars; 
        run;
    %end;
    %else %if %upcase(&sort)=SERP %then %do;
          ods select none;
          proc surveyselect data=&in_dsn samprate=1.0 out=_in_dsn_
                            sort=&sort  method=sys;
               control  &control_vars;
          run;
          ods select all;
    %end;
    %else %do;
         %put Specify only NEST or SERP for sort statement;
         %goto exit;
    %end;
    %let in_dsn=_in_dsn_;

    proc freq data=_in_dsn_ noprint;
         tables &control_vars_x /missing out=_freq_;
    run;

    proc sql noprint; 
           select name into :keyvars separated by '  '    
           from  sashelp.vcolumn
           where libname='WORK' & 
                     upcase(memname)='_FREQ_' & 
                     name not in ('COUNT', 'PERCENT')
           ;
    quit;
 %let _k= %sysfunc(countW(&keyvars));
 %if %eval(&_k=1) %then %let idxname=&keyvars;
 %else %let idxname=idx;
    data  _freq_(index=(&idxname.=(&keyvars)));
            set _freq_  nobs=ntotal  end=eof;
            _index_=_n_;
            if eof then call symput('ncomb', ntotal);
    run;
    %let withcontrol=1;

%end;
%else %do;
    %let withcontrol=0;
    %let ncomb=1;
%end;

proc sql noprint;
       select name into :originalvars separated by ' '
       from  sashelp.vcolumn
       where libname='WORK' & 
                     upcase(memname)='_IN_DSN_'
      ;
quit;

data probs;
     %do i=1 %to &n;
         _prob_&i=%scan(&probs, &i, &blank);
     %end;        
     output;
     drop i;
run; 

%let i=&n;
%put Initial counts: n=&n  i=&i   ncomb=&ncomb;
data &out_dsn      _N;
     array _P{*}  _prob_1-_prob_&n;
     array _N{0:%eval(&n+1)}  N_0-N_%eval(&n+1);
     array _Pmat{1:&ncomb, 1:&n} _temporary_;     
     array _Fmat{1:&ncomb, 1:&n} _temporary_;
     array _Xmat{1:&ncomb, 1:&n} _temporary_;
     array _Kmat{1:&ncomb, 1:&n} _temporary_;
     array _Nmat{1:&ncomb, 0:%eval(&n+1)} _temporary_;    

     if _n_=1 then do;
        set probs;        
        do i=1 to &ncomb;
           do j=1 to &n;
              _Pmat[i, j]=_P[j];
              if j=1 then _Fmat[i, j]=_P[j];
              else _Fmat[i, j]=_Fmat[i, j-1]+_P[j];
           end;
           do j=1 to &n;
              _Fmat[i, j]=_Fmat[i, j]/_Fmat[i, &n];
           end;
        end;
        put 'FINISH LOADING PROB VECTORS';
        %if &withcontrol=1 %then %do;            
            do while (^eof);
               set  _freq_  end=eof;              
               cn=0; 
               _Nmat[_index_, 0]=COUNT; _Nmat[_index_, %eval(&n+1)]=0;   
               do j=1 to &n;
                  if j<&n then do;
                     _Nmat[_index_, j]=round(_P[j]*COUNT, 1);
                     cn+_Nmat[_index_, j];
                  end;
                  else _Nmat[_index_, j]=COUNT-cn;
                  _Xmat[_index_, j]=_Nmat[_index_, j];
                  _Kmat[_index_, j]=0;
               end;               
            end;
        %end;    
        %else %do;
            _index_=1; cn=0;
            do j=1 to &n;
               if j<&n then do;
                  _Nmat[_index_, j]=round(_P[j]*&nobs, 1);
                  cn+_Nmat[_index_, j];
               end;
               else _Nmat[_index_, j]=&nobs-cn;
               _Xmat[_index_, j]=_Nmat[_index_, j];
               _Kmat[_index_, j]=0;
            end;
        %end;
        put 'FINISH LOADING ALL DATA SETS';
        do i=1 to &ncomb;
            do k=0 to %eval(&n+1);               
                _N[k]=_Nmat[i, k]; put _Nmat[i, k]=;
            end;
            output _N;
        end;
     end;
     set &in_dsn  nobs=ntotal   end=eof_in;

     r=ranuni(&seed);
     %if &withcontrol=1 %then %do;
        do until (_iorc_=%sysrc(_dsenom));
            set _freq_  key=&idxname;
            if _iorc_^=%sysrc(_sok) then  _error_=0;        
        end;        
     %end;
     %else %do;
         _index_=1;
     %end;
     notfound=1;  j=1;

     do j=1 to &n;
         if notfound then do;
               if r<=_Fmat[_index_, j] & _Kmat[_index_, j]<_Nmat[_index_,j] then do;
                    _BLOCK=j; notfound=0; _Kmat[_index_, _BLOCK]+1; 
               end;
        end;        
     end;
     if notfound then do;  _BLOCK=&n; notfound=0; _Kmat[_index_, _BLOCK]+1; end;

     _Xmat[_index_, _BLOCK]=_Nmat[_index_, _BLOCK]-_Kmat[_index_, _BLOCK];
     _Nmat[_index_, %eval(&n+1)]+1; 

     if ^(eof_in ) then do;
        _temp_=(_Nmat[_index_, 0]-_Nmat[_index_, %eval(&n+1)]);
        if _temp_<=0 then _Pmat[_index_, _BLOCK]=0;
        else  _Pmat[_index_, _BLOCK]=_Xmat[_index_, _BLOCK]/_temp_;               
        do j=1 to &n;
           if j=1 then _Fmat[_index_, j]=_Pmat[_index_, j];
           else _Fmat[_index_, j]=_Fmat[_index_, j-1]+_Pmat[_index_, j];  
        end;
        do j=1 to &n;   
            if _Fmat[_index_, &n]<=0 then _Fmat[_index_, j]=0;
            else _Fmat[_index_, j]=_Fmat[_index_, j]/_Fmat[_index_, &n]; 
        end;
     end;
     else if (eof_in) then do;
        do i=1 to &ncomb;
            do k=1 to %eval(&n);               
                _N[k]=_Kmat[i, k]; put _Kmat[i, k]=;
            end;
            output _N;
        end;
     end;
     keep &originalvars  _BLOCK;
     output &out_dsn;
run;

%exit:
%mend;



*options mprint mlogic;
%put START TIME 1: %sysfunc(datetime(),datetime20.);

data test0;
     do ID=1 to 796773;        
        TDSP=min(3, round(ranuni(8976)*4));
        if ranuni(93745)<0.3 then VS='H'; else VS='M';
        output;
     end;
run;
%put START TIME 2: %sysfunc(datetime(),datetime20.);

%let seed=99999;
%let probvector=0.1485905 0.1485905 0.2324283 0.2351954 0.2351954;
%let control_vars=TDSP  VS;
%let in_dsn=test0;
%let out_dsn=test_out;
%let sort=NEST;
%split(&seed, &probvector, &control_vars, &in_dsn, out_dsn=&out_dsn);
*options nomprint nomlogic;
%put END TIME: %sysfunc(datetime(),datetime20.);

;;

proc freq data=&out_dsn noprint;
        tables _BLOCK*TDSP*VS /missing out=_freq_out_;
run;

2 comments:

drcabana said...

Hi,

I was wondering whether you could elaborate on what this code is intended to accomplish.

I gather there are two inputs. One is a probability vector, which I take to be a vector of non-negative weights which sum to one. The other input is a SAS table.

What I don't understand is what the desired outcome is. What does it mean to split a table according to a vector?

For instance, if the weights were .1, .2, .3, and .4, and the table consists of 1000 records each containing a single integer between 1 and 1000, no duplicates, what is the expected outcome?

Thank you.

L Xie said...

Some one on SAS-L asked how to split a SAS data set into N pieces given a N-by-1 probability vector, while the distribution of some given variables, TDSP and VS here, should be the same in the smaller files which equals the distribution of the original file.

To ensure randomness, I used K/N algorithm which built upon conditional probability of spliting. SAS-L had some extensive discussion on this methed several years.

What I did here is extending the K/N algorithm to take care of control variables. Taking any value combination of control variables as a strata, the randomness is archieved within this strata since the distribution of control variables are given by the original data set.