||X-Y||2
Typically we have to manually code the search function in a DATA STEP, either using ARRAY or using HASH OBJ.
But if we are only care about the 1 closest point in lookup table, we can also tweak PROC FASTCLUS for a simple yet fast implementation. This is the same as 1-Nearest Neighbor calculation.
Here is an example with 2-dimension data and Euclidean Distance:
data fix;
input x y;
datalines;
1 3
2 4
3 5
8 0.2
15 1
;
run;
data have;
input x y;
datalines;
1.2 6
0.3 4
10 1.2
7 1
2.9 4
;
run;
data fix;
set fix;
CLUSTER=_n_;
run;
%let dsid=%sysfunc(open(fix));
%let ntotal=%sysfunc(attrn(&dsid, NOBS));
%let dsid=%sysfunc(close(&dsid));
proc fastclus data=have out=have2
seed=fix maxclusters=&ntotal
noprint maxiter=0 ;
var x y;
run;
/* BE SURE TO MERGE BACK TO SEE WHICH SEEDS WERE MATCHED */
proc sort data=fix; by cluster; run;
proc sort data=have2; by cluster; run;
data have2;
merge have2 fix(rename=(x=x2 y=y2));
by cluster;
run;
Comparing hash obj solution courtesy Paul Dorfman and my binary search approach:
104
105 data closest (drop = _:) ;
106 array _f [99999] _temporary_ ;
107 do _h = 2 by 1 until (z) ;
108 set fix end = z ;
109 _f[_h] = fix ;
110 end ;
111 _f[_h+1] = constant ("big") ;
112 do until (0) ;
113 set num ;
114 do _j = 2 by 1 until (_f[_j-1] <= num <= _f[_j]) ;
115 end ;
116 if _j = 2 then closest = _f[ 2] ;
117 else if _j = _h + 1 then closest = _f[_h] ;
118 else if sum (num, - _f[_j-1]) < sum (_f[_j], -num)
119 then closest = _f[_j-1] ;
120 else closest = _f[_j] ;
121 output ;
122 end ;
123 drop fix;
124 run ;
NOTE: There were 1000 observations read from the data set WORK.FIX.
NOTE: There were 1000000 observations read from the data set WORK.NUM.
NOTE: The data set WORK.CLOSEST has 1000000 observations and 4 variables.
NOTE: DATA statement used (Total process time):
real time 40.61 seconds
cpu time 40.64 seconds
125
126
127 %let dsid=%sysfunc(open(fix));
128 %let ntotal=%sysfunc(attrn(&dsid, NOBS));
129 %let dsid=%sysfunc(close(&dsid));
130 proc fastclus data=num(rename=(num=fix)) seed=fix cluster=CLASS
131 maxclusters=&ntotal noprint maxiter=0 out=num2;
132 var fix;
133 run;
NOTE: There were 1000 observations read from the data set WORK.FIX.
NOTE: The data set WORK.NUM2 has 1000000 observations and 4 variables.
NOTE: PROCEDURE FASTCLUS used (Total process time):
real time 6.41 seconds
cpu time 6.42 seconds
134
135 data compare;
136 merge closest(keep=j closest) num2(keep=j Class) end=eof;
137 by j;
138 retain d 0;
139 d+(closest ne Class);
140 if eof then put d=;
141 run;
d=0
NOTE: There were 1000000 observations read from the data set WORK.CLOSEST.
NOTE: There were 1000000 observations read from the data set WORK.NUM2.
NOTE: The data set WORK.COMPARE has 1000000 observations and 4 variables.
NOTE: DATA statement used (Total process time):
real time 3.92 seconds
cpu time 0.53 seconds
Binary search:
data have ;
input num @@ ;
cards ;
2.2 3 4.4 5 6.6 7 8.8 9 11.11 12.12 14.9 15.01
run ;
data fix ;
input num @@ ;
cards ;
3 6 9 12 15
run ;
%let dsid=%sysfunc(open(fix));
%let ntotal=%sysfunc(attrn(&dsid, NOBS));
%let dsid=%sysfunc(close(&dsid));
%put &ntotal;
proc sort data=fix; by num; run;
data have2;
array _f{1:&ntotal} _temporary_;
if _n_=1 then do;
do i=1 to &ntotal;
set fix point=i;
_f[i]=num;
end;
end;
set have;
L=0; U=&ntotal; break=0;
if num<=_f[1] then CLUSTER=_f[1];
else if num>=_f[&ntotal] then CLUSTER=_f[&ntotal];
else do;
do while (L < U & break=0);
mid=L+int((U-L)/2);
if _f[mid]=num then do; L=mid; break=1; end;
else do;
do while (L < U & break=0);
mid=L+int((U-L)/2);
if _f[mid]=num then do; L=mid; break=1; end;
else do;
if _f[mid]< num then L=mid+1;
else U=mid;
end;
end;
if mid=L then mid=max(1, mid-1);
if abs(num-_f[mid])< abs(num-_f[L]) then CLUSTER=_f[mid];
else CLUSTER=_f[L];
end;
put mid= L= U= num= CLUSTER=;
run;
Robustness of the PROC FASTCLUS solution:
the correctness of PROC FASTCLUS result is guranteed and demonstrated here.
Be sure to specify "maxiter=0" in the PROC FASTCLUS options.
sample code:
******************;
data have ;
input num @@ ;
cards ;
2.2 3 4.4 5 6.6 7 8.8 9 11.11 12.12 14.9 15.01
run ;
data fix ;
input num @@ ;
cards ;
3 6 9 12 15
run ;
data fix; set fix; CLUSTER=_n_; run;
data have; set have; id=_n_; run;
%let dsid=%sysfunc(open(fix));
%let ntotal=%sysfunc(attrn(&dsid, NOBS));
%let dsid=%sysfunc(close(&dsid));
proc fastclus data=have seed=fix
maxclusters=&ntotal noprint maxiter=0
out=have2 cluster=CLUSTER;
var num;
run;
proc sql;
create table have2 as
select a.*, b.num as fix
from have2 as a
join fix as b
on a.CLUSTER=b.CLUSTER
order by a.num
;
quit;
proc print data=have2; run;
****************;