He has a data with two IDs:
id1 id2
a b
a e
b c
b e
c e
d e
and he wants to generate a new data set with the following structure according to above information :
id a b c d e
a 0 1 0 0 1
b 1 0 1 0 1
c 0 1 0 0 1
d 0 0 0 0 1
e 1 1 1 1 0
The real data is potentially big.
***************************;
At first look, this is a typical table look up problem SAS programmers facing almost everyday, that is duplicate keyed lookup table. It is a simple one because there is no inherent relationship among records.
data original;
input id1 $ id2 $;
datalines;
a b
a e
b c
b e
c e
d e
;
run;
proc datasets library=work nolist;
modify original;
index create id1 id2;
quit;
proc sql;
create table all_cases as
select a.*, monotonic() as seq
from (
select distinct id1 as id
from original
union
select distinct id2 as id
from original
) as a
order by a.id
;
quit;
proc sql noprint;
select id into :idnames separated by ' '
from all_cases
;
quit;
data new;
if _n_=1 then do;
declare hash _h(dataset:'all_cases');
_h.defineKey('id');
_h.defineData('seq');
_h.defineDone();
end;
set all_cases;
array _a{*} &idnames;
id1=id;
set original key=id1;
_mx_=%sysrc(_sok);
do while (_iorc_=%sysrc(_sok));
rc=_h.find(key:id2); if rc=0 then _a[seq]=1;
id1=id;
set original key=id1;
end;
_ERROR_=0;
id2=id;
set original key=id2;
do while (_iorc_=%sysrc(_sok));
rc=_h.find(key:id1); if rc=0 then _a[seq]=1;
id2=id;
set original key=id2;
end;
_ERROR_=0;
do j=1 to dim(_a); _a[j]=max(0, _a[j]); end;
keep id &idnames;
run;
On the other hand, this problem can be solved in a more SASsy way like this:
data original;
input id1 $ id2 $;
datalines;
a b
a e
b c
b e
c e
d e
;
run;
proc sql;
create table newx as
select a.id1, a.id2, (sum(a.id1=c.id1 & a.id2=c.id2)>0) as count
from
(select a.id as id1, b.id as id2
from all_cases as a, all_cases as b) as a
left join original as c
on a.id1=c.id1 or a.id2=c.id1
group by a.id1, a.id2
;
quit;
proc transpose data=newx out=_freq_t name=id2;
by id1;
var count;
id id2;
run;
data _freq_t;
set _freq_t;
array _n{*} _numeric_;
do i=1 to dim(_n);
_n[i]=(_n[i]>0);
end;
drop i;
run;
proc transpose data=_freq_t(drop=id2) out=_freq_t2 name=id1;
id id1;
run;
proc sql noprint;
select id1, count(distinct id1) into :covars separated by ' ', :count
from _freq_t;
quit;
data new2;
set _freq_t;
array _x{*} &covars;
array _x2{&count} _temporary_;
do j=1 to &count; _x2[j]=_x[j]; end;
set _freq_t2;
do j=1 to &count; _x[j]=(_x[j]+_x2[j]>0); end;
drop j id2;
run;
0 comments:
Post a Comment