Wednesday, September 23, 2009

Tweak PROC FASTCLUS for 1-Nearest Neighbor / Closest Match

In most table lookup tasks, we are doing EXACT matching. However, sometimes we are looking for closest match in the lookup table. By 'closest', we mean smallest Eucleadian distance:


||X-Y||2

Typically we have to manually code the search function in a DATA STEP, either using ARRAY or using HASH OBJ.

But if we are only care about the 1 closest point in lookup table, we can also tweak PROC FASTCLUS for a simple yet fast implementation. This is the same as 1-Nearest Neighbor calculation.

Here is an example with 2-dimension data and Euclidean Distance:

data fix;
input x y;
datalines;
1 3
2 4
3 5
8 0.2
15 1
;
run;
data have;
input x y;
datalines;
1.2 6
0.3 4
10 1.2
7 1
2.9 4
;
run;
data fix;
   set fix;
   CLUSTER=_n_;
run;

%let dsid=%sysfunc(open(fix));
%let ntotal=%sysfunc(attrn(&dsid, NOBS));
%let dsid=%sysfunc(close(&dsid));
proc fastclus data=have out=have2
              seed=fix  maxclusters=&ntotal 
              noprint maxiter=0 ;
     var x y;
run;

/* BE SURE TO MERGE BACK TO SEE WHICH SEEDS WERE MATCHED */
proc sort data=fix; by cluster; run;
proc sort data=have2; by cluster; run;
data have2;
     merge have2   fix(rename=(x=x2  y=y2));
     by cluster;
run;


Comparing hash obj solution courtesy Paul Dorfman and my binary search approach:


104
105 data closest (drop = _:) ;
106 array _f [99999] _temporary_ ;
107 do _h = 2 by 1 until (z) ;
108 set fix end = z ;
109 _f[_h] = fix ;
110 end ;
111 _f[_h+1] = constant ("big") ;
112 do until (0) ;
113 set num ;
114 do _j = 2 by 1 until (_f[_j-1] <= num <= _f[_j]) ;
115 end ;
116 if _j = 2 then closest = _f[ 2] ;
117 else if _j = _h + 1 then closest = _f[_h] ;
118 else if sum (num, - _f[_j-1]) < sum (_f[_j], -num)
119 then closest = _f[_j-1] ;
120 else closest = _f[_j] ;
121 output ;
122 end ;
123 drop fix;
124 run ;



NOTE: There were 1000 observations read from the data set WORK.FIX.
NOTE: There were 1000000 observations read from the data set WORK.NUM.
NOTE: The data set WORK.CLOSEST has 1000000 observations and 4 variables.
NOTE: DATA statement used (Total process time):
real time 40.61 seconds
cpu time 40.64 seconds

125
126
127 %let dsid=%sysfunc(open(fix));
128 %let ntotal=%sysfunc(attrn(&dsid, NOBS));
129 %let dsid=%sysfunc(close(&dsid));
130 proc fastclus data=num(rename=(num=fix)) seed=fix cluster=CLASS
131 maxclusters=&ntotal noprint maxiter=0 out=num2;
132 var fix;
133 run;

NOTE: There were 1000 observations read from the data set WORK.FIX.
NOTE: The data set WORK.NUM2 has 1000000 observations and 4 variables.
NOTE: PROCEDURE FASTCLUS used (Total process time):
real time 6.41 seconds
cpu time 6.42 seconds

134
135 data compare;
136 merge closest(keep=j closest) num2(keep=j Class) end=eof;
137 by j;
138 retain d 0;
139 d+(closest ne Class);
140 if eof then put d=;
141 run;

d=0

NOTE: There were 1000000 observations read from the data set WORK.CLOSEST.
NOTE: There were 1000000 observations read from the data set WORK.NUM2.
NOTE: The data set WORK.COMPARE has 1000000 observations and 4 variables.
NOTE: DATA statement used (Total process time):
real time 3.92 seconds
cpu time 0.53 seconds


Binary search:



data have ;
input num @@ ;
cards ;
2.2 3 4.4 5 6.6 7 8.8 9 11.11 12.12 14.9 15.01
run ;

data fix ;
input num @@ ;
cards ;
3 6 9 12 15
run ;


%let dsid=%sysfunc(open(fix));
%let ntotal=%sysfunc(attrn(&dsid, NOBS));
%let dsid=%sysfunc(close(&dsid));
%put &ntotal;
proc sort data=fix; by num; run;
data have2;
array _f{1:&ntotal} _temporary_;
if _n_=1 then do;
do i=1 to &ntotal;
set fix point=i;
_f[i]=num;
end;
end;
set have;
L=0; U=&ntotal; break=0;
if num<=_f[1] then CLUSTER=_f[1];
else if num>=_f[&ntotal] then CLUSTER=_f[&ntotal];
else do;
do while (L < U & break=0);
mid=L+int((U-L)/2);
if _f[mid]=num then do; L=mid; break=1; end;
else do;
    do while (L < U & break=0);
         mid=L+int((U-L)/2);
    if _f[mid]=num then do; L=mid; break=1; end;
    else do;
            if _f[mid]< num then L=mid+1;
      else U=mid;
    end;
       end;
    if mid=L then mid=max(1, mid-1);
       if abs(num-_f[mid])< abs(num-_f[L]) then CLUSTER=_f[mid];
    else CLUSTER=_f[L];
  end;

put mid= L= U= num= CLUSTER=;
run;



Robustness of the PROC FASTCLUS solution:
the correctness of PROC FASTCLUS result is guranteed and demonstrated here.


Be sure to specify "maxiter=0" in the PROC FASTCLUS options.

sample code:



******************;
data have ;
input num @@ ;
cards ;
2.2 3 4.4 5 6.6 7 8.8 9 11.11 12.12 14.9 15.01
run ;

data fix ;
input num @@ ;
cards ;
3 6 9 12 15
run ;

data fix; set fix; CLUSTER=_n_; run;
data have; set have; id=_n_; run;


%let dsid=%sysfunc(open(fix));
%let ntotal=%sysfunc(attrn(&dsid, NOBS));
%let dsid=%sysfunc(close(&dsid));
proc fastclus data=have seed=fix 
              maxclusters=&ntotal noprint maxiter=0
              out=have2 cluster=CLUSTER;
     var num;
run;

proc sql;
create table have2 as
select a.*, b.num as fix
from have2 as a
join fix as b
on a.CLUSTER=b.CLUSTER
order by a.num
;
quit;

proc print data=have2; run;
****************;

0 comments: