Wednesday, August 04, 2010

Table Look Up in SAS, practical problems

One guy asked in a SAS forum about a typical table look up problem:
He has a data with two IDs:
id1 id2
a b
a e
b c
b e
c e
d e

and he wants to generate a new data set with the following structure according to above information :
id a b c d e
a 0 1 0 0 1
b 1 0 1 0 1
c 0 1 0 0 1
d 0 0 0 0 1
e 1 1 1 1 0

The real data is potentially big.
***************************;
At first look, this is a typical table look up problem SAS programmers facing almost everyday, that is duplicate keyed lookup table. It is a simple one because there is no inherent relationship among records.


data original;
   input id1 $ id2 $;
datalines;
a b
a e
b c
b e
c e
d e
;
run;

proc datasets library=work nolist;
     modify original;
  index create id1 id2;
quit;

proc sql;
     create table all_cases as
  select a.*, monotonic() as seq
  from (
  select distinct id1 as id
  from original
  union
  select distinct id2 as id
  from original
  ) as a
  order by a.id
  ;
quit;

proc sql noprint;
     select id into :idnames separated by ' '
  from   all_cases
  ;
quit;

data new;
  if _n_=1 then do;
     declare hash _h(dataset:'all_cases');
     _h.defineKey('id');
     _h.defineData('seq');
     _h.defineDone();
     end; 
     set all_cases;

  array _a{*} &idnames; 

  id1=id;  
  set original key=id1;      
  _mx_=%sysrc(_sok);
  
  do while (_iorc_=%sysrc(_sok));   
     rc=_h.find(key:id2); if rc=0 then _a[seq]=1;
  id1=id;
     set original key=id1;  
  
  end;
  _ERROR_=0;
  
  id2=id;  
  set original key=id2;      
  do while (_iorc_=%sysrc(_sok)); 
     rc=_h.find(key:id1); if rc=0 then _a[seq]=1;
  id2=id;
     set original key=id2;  
  end;
  _ERROR_=0;
  do j=1 to dim(_a); _a[j]=max(0, _a[j]); end;
  keep id &idnames;
run;

On the other hand, this problem can be solved in a more SASsy way like this:


data original;
   input id1 $ id2 $;
datalines;
a b
a e
b c
b e
c e
d e
;
run;

proc sql;
     create table newx as
     select a.id1, a.id2, (sum(a.id1=c.id1 & a.id2=c.id2)>0) as count
     from   
       (select a.id as id1, b.id as id2 
        from all_cases as a, all_cases as b) as a
left join   original as c
       on   a.id1=c.id1 or a.id2=c.id1
    group by a.id1, a.id2
    ;
quit;

proc transpose data=newx  out=_freq_t name=id2;
     by id1;
     var count;
     id id2;
run;

data _freq_t;
     set _freq_t;
     array _n{*} _numeric_;
     do i=1 to dim(_n);
        _n[i]=(_n[i]>0);
     end;
     drop i;
run;

proc transpose data=_freq_t(drop=id2) out=_freq_t2  name=id1;
     id id1;
run;

proc sql noprint;
     select id1, count(distinct id1) into :covars separated by ' ', :count
     from   _freq_t;  
quit;

data new2;
     set _freq_t;
     array _x{*} &covars;
     array _x2{&count} _temporary_;

     do j=1 to &count; _x2[j]=_x[j]; end;
     set _freq_t2;
     do j=1 to &count; _x[j]=(_x[j]+_x2[j]>0); end;
     drop j id2;
run;

0 comments: