%macro trc(data=,var_list=,gew=,rob_reg=,qkrit=,list=,out=,mdist=,dilim=);

/*****************************************************************************
 Macro-Parameters   :
  data=         Input sas data set                               (required)

  var_list=     variable list  (max 20)                          (required)
                full list of Var_Names, as : var_list=x1 x2 x3 .......

  gew=          name of sampling weight variable                 (optional)
                if none a variable w=1 for every observation is created

  rob_reg=      robust regression method METH/IRLS/LMS/LTS       (default : IRLS)
                 LMS & LTS are without sampling weight variable

  qkrit=        quality criterion parameter in (0,1)             (default : 0.1)

  list=         list_output 0,1,2 : none,medium,full             (default : 1)

  out=          Output sas data set                              (default : _out_)

  mdist=        conditional or marginal Mahalanobis distance 1,2 (default : 1)

  dilim=        lower limit for outlier indicator variable u     (default : none)
                observations with a di value >= dilim will be declared as outliers,
                i.e u=0 and u=1 for non-outliers
                TRC could be executed again, with a limit value, then only
                the output file will be modified - without recalculating the
                the Mahalanobis distances - by adding the outlier indicator variable u

 example :
  %trc(data=<libref>.bushfire20w,var_list=v1 v2 v3 v4 v5,gew=w,
            rob_reg=irls,qkrit=0.5,list=2,out=<libref>.bushfiretrc,dilim=);

 -----------------------------------------------------------------------------
 Version 1.3 , Sep-2004
  - fixed a problem with datasets with only non missing values

 SAS Modules : Base, IML, Graph  / Version 8.2 or higher
 Copyright   : Swiss Federal Statistical Office (SFSO), B. Hulliger & C. Beguin
               Swiss Federal Office of Information Technology, M. Eichenberger
 Reference   : Euredit D4/5.2.1.C, B. Hulliger & C. Beguin, SFSO
*****************************************************************************/

options nodate pageno=1 nonumber  ps=50 ls=90;
title1
  "Robust Multivariate Outlier Detection & Imputation with incomplete Survey Data: TRC ";
%global dsexist varexist mflag mvar ;
%if &mdist=  or &mdist < 1 or &mdist > 2  %then %let mdist=1;
%if &mdist=1 %then %let mdtxt=conditional; %else %let mdtxt=marginal;
%if &list= or &list < 0 or &list > 3 %then %let list=1;
%if &out=  %then %let out=_out_;
%if %upcase(&rob_reg) ne LMS and %upcase(&rob_reg) ne LTS
    and %upcase(&rob_reg) ne METH %then %let rob_reg=IRLS;
proc datasets lib=work;
 delete impu_m impu impuf;
run;

%macro exist(data);
%let dsexist=no;
data _null_;
  if 0 then set &data;
  stop;
run;
%if &syserr=0 %then %let dsexist=yes;
%mend exist;

%macro varex(data,var);
%let varexist=no;
data _varex_;
  set &data;
  if _n_=2 then stop;
run;
data _null_;
  set _varex_(keep=&var obs=0);
run;
%if &syserr=0 %then %let varexist=yes;
%mend;

%exist(&data);
%if &dsexist=no    %then %do;
    data _null_;
    file print;
    put #3 @2 " Program terminated : No File  &data ";
    run;
    %goto ende_x;
%end;
      /* outlier indicator var u, if dilim is specified */
%if &dilim ne   %then %do;
%exist(&out);
 %if &dsexist = no  %then %do;
    data _null_;
    file print;
    put #3 @2
   "No Output file, TRC will be executed first, than the outlier indicator can be created";
    put #4 @2
   "Wait till TRC has finished, rerun TRC, dilim= must have a value for outlier definition";
    run;
 %end;
 %if &dsexist = yes  %then %do;
   data &out;
       set &out;
       u=1;
       if  di >= &dilim then u=0;
   run;
   data _null_;
    file print;
    put #3 @2
   "In Output file &out a outlier indicator u=0 has been added for DI >= &dilim";
   run;
   %goto ende_x;
 %end;
%end;

%let nvar=0;
%let gewfl=1;
%if &qkrit eq  or &qkrit <= 0 or &qkrit >= 1 %then %let qkrit=0.1;
%let gewicht=;

%if &gew ne %then %do;
 %varex(&data,&gew);
   %if &varexist=no    %then %do;
     data _null_;
     file print;
     put #3 @2 " Program terminated : No Weight_Var &gew in file &data ";
     run;
     %goto ende_x;
   %end;
   %else %let gewicht=weight &gew;
%end;

%if &gew eq %then %do;
 %let gewfl=0;
 %let gew=w;
 %let gewicht=weight w;
%end;
%do i=1  %to  20;
 %let s=%scan(&var_list,&i,' ');
  %if &s ne %then %do;
    %let var&i=&s;
    %varex(&data,&&var&i);
      %if &varexist=no    %then %do;
       data _null_;
       file print;
       put #3 @2 " Program terminated : No Var  &&var&i in file &data ";
       run;
       %goto ende_x;
      %end;
    %let nvar=&i;
  %end;
%end;

data daten;
 set &data;
 %if &gewfl=0 %then %do;
  w=1;
 %end;
run;

  /* elimination of obs with missing values in every var */
data daten;
 set daten;
  if
  %do i=1  %to  %eval(&nvar-1);
    &&var&i eq .  and
  %end;
  &&var&nvar eq . then delete ;
run;

 /* Macro for weighted Median & MAD  */
%macro mad(data=,var_list=,nvar=,gew=,out1=,out2=);
%let mflag=0;
proc means data = &data  median noprint;
 var &var_list;
 &gewicht;
 output out = &out1(drop=_type_ _freq_)  median =
  %do i=1 %to &nvar;
   med_&&var&i
  %end;
run;
data _null_;
 set  &out1;
 if _n_ =1 then do;
  %do i=1 %to &nvar;
   call symput('median'!!left(&i),med_&&var&i);
  %end;
 end;
run;
data temp;
  set &data;
  %do i=1 %to &nvar;
   r&i = abs(&&var&i - &&median&i);
  %end;
run;
proc means data= temp median noprint;
  var
    %do i=1 %to &nvar;
      r&i
    %end;
      ;
   &gewicht;
  output out=&out2(drop=_type_ _freq_) median=
    %do i=1 %to &nvar;
      mad_r&i
    %end;
       ;
run;
data &out2;
 set &out2;
    %do i=1 %to &nvar;
      mad_r&i = mad_r&i/0.6745;
      if mad_r&i = 0 then do;
         call symput('mflag',1); call symput('mvar',"&&var&i");
      end;
    %end;
       ;
run;
%mend;

        /* evaluation median and mad for data with sampling weight   */
%mad(data=daten,var_list=&var_list,nvar=&nvar,gew=&gewicht,out1=med1,out2=mad1);

%if &mflag=1   %then %do;
    data _null_;
    file print;
    put #3 @2 "Program terminated :  variable &mvar has mad = 0 (singular matrix S1~) ";
    run;
    %goto ende_x;
%end;

proc iml  workspace=16384;
reset noname;
start gew_median(x,w,med);
     a=x;
     b=w;
     r=rank(x);
     x[r,] = a;
     w[r,] = b;
     u=t(unique(x));
     nu=nrow(u);
     sf=j(nu,1,.);
     med=j(1,1,.);
      do i=1 to nu;
        s=u[i];
        sa=loc(x<=s);
        sf[i]=sum(w[sa]);
      end;
      sf2=(sf[nu])/2;
      if sf2 < sf[1] then do;
        ind = 1;
      end;
      else do;
        ind=max(loc(sf<sf2)) + 1 ;
      end;
      x= a;  w = b;
     med=u[ind];
 finish;

 varname1={
           %do i=1 %to &nvar;
             "&&var&i"
           %end;
           };
 varname2={
           %do i=1 %to &nvar;
             "&&var&i"
           %end;
             "&gew"
           };
 varname3={ "Obs_Nr" "dep_var" "regressor" "slope" "intercept" "imputed" "n obs" "cor"};
 impu_miss = j(1,8,' ');
 create impu_m from impu_miss [colname=varname3];
 close impu_m;
use daten;
read all var{&var_list} into dat;
 nrd=nrow(dat);
 ncd=ncol(dat);
use daten;
read all var{&gew} into w;
use mad1;
read all  into  mad;

rid=j(nrd,ncd,1);
if any(dat = .) then do;
miss = loc(dat = .);
rid[miss]=0;
end;
       /* Spearman Rankcorrelation */
corr=j(ncd,ncd,.);
do k=2 to ncd;
 x=dat[,k];
 rx=rid[,k];
 k1=k-1;
 do m=1 to k1;
   y=dat[,m];
   ry=rid[,m];
   swxy=0;
   sw=0;
   wxy=rx#ry#w;
    do i=1 to nrd;
     a=x[i];
     b=y[i];
     if a ^= . & b ^= . then do;
     ssx1=0; ssx2=0; ssy1=0; ssy2=0;
     if any(x < a) then do;
        sx1=loc(x<a);
        ssx1=sum(wxy[sx1]);
     end;
        sx2=loc(x=a);
        ssx2=sum(wxy[sx2]);
     if any(y < b) then do;
        sy1=loc(y<b);
        ssy1=sum(wxy[sy1]);
     end;
        sy2=loc(y=b);
        ssy2=sum(wxy[sy2]);
     sx=ssx1 + 0.5#ssx2 + 0.5;
     sy=ssy1 + 0.5#ssy2 + 0.5;
     swxy=swxy+wxy[i]#(sx#sy);
     sw=sw+wxy[i];
     end;
    end;
   co=(12/(sw#sw#sw))#swxy - 3;
   corr[k,m]=min(max(co,-1),1);
   corr[m,k]=corr[k,m];
 end;
end;
do k=1 to ncd;
   corr[k,k]=1.;
end;
dmad=diag(mad);
rcorr=2#sin(0.5235988#corr);
scorr=dmad*rcorr*dmad;
create corr from corr   [colname=varname1];
 append from corr;
create rcorr from rcorr [colname=varname1];
 append from rcorr;
create scorr from scorr [colname=varname1];
 append from scorr;

   /* imputation with robust regression */
imflag=1;

   /* functions for IRLS imputation     */
start reg(x,y,gw,nr,cf,res);
 intc=shape(1,nr,1);
 tgw=gw`;
 tgw=shape(tgw,2,nr);
 xx=x||intc;
 cf=inv((xx`#tgw)*xx)*((xx`#tgw)*y);
 yp=cf[1,1]#x + cf[2,1];
 res=y-yp;
finish;

start gewi(res,wr,nr,mad);
scaleres=abs(res/mad);
do i=1 to nr;
 if  abs(res[i]) <= 1.345*mad then wr[i] = 1;
 if  abs(res[i]) >  1.345*mad then wr[i] = 1.345/scaleres[i];
end;
finish;

start mad(res,mad);
mres=median(res);
absmres=abs(res-mres);
mad=median(absmres)/0.6745;
finish;

   /* begin of imputation process  */
f_noimpu = 0;
if any(dat = .) then do;
 slope=j(ncd,ncd,.);
 intc =j(ncd,ncd,.);
 b=j(1,2,.);
 optn = j(8,1,.);
 optn[1]= 0; optn[2]=0; optn[3]=0; optn[7]=0; optn[8]=1;
 %if %upcase(&rob_reg) eq LMS %then %do;
  optn[5] = 10;
 %end;
 datim=dat;
 rid[miss]=.;
 impu=j(1,8,'                                ');
 do j=1 to ncd;
  qflag = j(1,ncd,.);
  do jj=1 to ncd;
   if  jj ^= j then do;
     cs=sum(rid[,j]#rid[,jj]);
     if cs > &qkrit#nrd then do;
       qflag[1,jj] = 1;
     end;
   end;
  end;

  do i=1 to nrd;
   if dat[i,j] = .  then do;
      vcf=j(1,ncd,0);
      v=abs(corr[j,]);
      c=rid[i,];
      vcf=v#c#qflag;
    if any(vcf ^= .) then do;
      m=max(vcf);
      j0=max(loc(vcf=m));
      datr=dat[,j]||dat[,j0]||w;
      missr = loc(datr = .);
      rows=1+floor((missr-1)/3);
      rows=unique(rows);
      keep=remove(1:nrow(datr),rows);
      datr=datr[keep,];
      y=datr[,1];
      x=datr[,2];
      nr=nrow(y);

      %if %upcase(&rob_reg) eq METH  %then %do;
       wr=datr[,3];
       if slope[j,j0] = . then do;
        run gew_median(y,wr,medy);
        run gew_median(x,wr,medx);
        my=abs(y-medy);
        mx=abs(x-medx);
        run gew_median(my,wr,sigy);
        run gew_median(mx,wr,sigx);
        rho=rcorr[j,j0];
        slope[j,j0] = rho#(sigy/sigx);
        res=y-slope[j,j0]#x;
        run gew_median(res,wr,alfa);
        *alfa=medy - slope[j,j0]#medx;
        intc[j,j0] = alfa;
       end;
        b[1,1]=slope[j,j0];
        b[1,2]= intc[j,j0];
      %end;

      %if %upcase(&rob_reg) eq IRLS  %then %do;
       wd=datr[,3];
        wr=j(nr,1,1);
        gw=wd#wr;
        run reg(x,y,gw,nr,cf,res);
        run mad(res,mad);
        delta=1.;
        do  it=1 to 10 while(delta > 0.05);
         cfold=cf;
         run gewi(res,wr,nr,mad);
         gw=wd#wr;
         run reg(x,y,gw,nr,cf,res);
          delta=abs(cfold[1,1]-cf[1,1])+abs(cfold[2,1]-cf[2,1]);
           b[1,1]=cf[1,1];   b[1,2]=cf[2,1];
        end;
      %end;

      %if %upcase(&rob_reg) eq LMS or %upcase(&rob_reg) eq LTS %then %do;
         call &rob_reg(sc,coef,wgt,optn,y,x);
         b=coef[1,];
      %end;
      datim[i,j] = b[1,1]#dat[i,j0] + b[1,2];
      impu[1,1] = char(i,6); impu[1,2]=varname1[1,j]; impu[1,3]=varname1[1,j0];
      impu[1,4] = char(b[1,1],12,6); impu[1,5]=char(b[1,2],12,6);
      impu[1,6] = char(datim[i,j],12,4); impu[1,7]=char(nr,6);
      impu[1,8] = char(corr[j0,j],12,4);
    end;
    else do;
      imflag=0;
      print "No Imputation possible (perhaps quality criterion too severe) ";
      abort;
    end;
     imputation=imputation//impu;
   end;
  end;
 end;
 create impu from imputation [colname=varname3];
        append from imputation;
 dat=datim;
 free datim;
end;
else do;
 f_noimpu=1;
end;
create f_imp from f_noimpu;
   append from f_noimpu;
   /* eigen values: diagonal (d), orthogonal (q) matrix q*d*q`=scorr & data transf. */
if imflag = 1 then do;
 call eigen(d,q,scorr);
 tdat=dat*q;
 tdat=tdat||w;
 create q    from q    [colname=varname1];
   append from q;
 create tdat from tdat [colname=varname2];
   append from tdat;
end;
run; quit;

data _null_;
 set f_imp;
 call symput('f_noimpu',left(col1));
run;

    /* evaluation of median and mad for transformed data (dat*q)  */
%mad(data=tdat,var_list=&var_list,nvar=&nvar,gew=&gewicht,out1=med2,out2=mad2);

proc iml  workspace=16384;
reset noname;
varname1={
           %do i=1 %to &nvar;
             "&&var&i"
           %end;
           };
varname= {
           %do i=1 %to &nvar;
             "&&var&i"
           %end;
             "&gew"
             "Mahalanobis"
           };
use q;
 read all  into  q;
use med2;
 read all  into  med;
use mad2;
 read all  into  mad;
use daten;
read all var{&var_list} into dat;
 nrd=nrow(dat);
 ncd=ncol(dat);
use daten;
read all var{&gew} into w;

madd=diag((mad#mad));
mtil=(q*med`)`;
sigma=q*madd*q`;
isigma=inv(sigma);
rid=j(nrd,ncd,1);

if any(dat = .) then do;
miss = loc(dat = .);
rid[miss]=0;
end;
create scov from sigma [colname=varname1];
 append from sigma;
create center from mtil [colname=varname1];
 append from mtil;

       /* Mahalanobis Distances */
md=j(nrd,1,.);
       /* conditional version */
%if &mdist = 1 %then %do;
do i=1 to nrd;
f=ncd/rid[i,+];
r=diag(rid[i,]);
x=dat[i,]#rid[i,];
xm=x-mtil;
xmt=xm`;
irs=r*isigma*r;
md[i]=f#(xm*irs*xmt);
end;
%end;
       /* marginal version */
%if &mdist = 2 %then %do;
e=j(ncd,ncd,1);
do i=1 to nrd;
f=ncd/rid[i,+];
r=diag(rid[i,]);
x=dat[i,]#rid[i,];
if f = 1 then do;
  xm=x-mtil;
  xmt=xm`;
  md[i]=xm*isigma*xmt;
end;
else do;
  re=r*e*r;
  zero = loc(re=0);
  nzero=ncol(zero);
  nm=sqrt((ncd*ncd)-nzero);
  rs=remove(sigma,zero);
  rs=shape(rs,nm,nm);
  irs=inv(rs);
  xm=x-mtil;
  zero = loc(rid[i,]=0);
  nzero=ncol(zero);
  nm=ncd-nzero;
  xm=remove(xm,zero);
  xm=shape(xm,nm);
  xmt=xm`;
  md[i]=f#(xmt*irs*xm);
end;
end;
%end;

dat=dat||w||md;
create &out from dat [colname=varname];
 append from dat;
run; quit;

data &out;
  set &out;
  log_maha=log(mahalanobis);
  obs_nr = _n_;
run;

proc sort data=&out;
     by mahalanobis;
run;

proc means data = &out  median noprint;
 var mahalanobis;
 output out = _tmp_ median = median
 run;
data _null_;
  set _tmp_;
  call symput('mdmed',left(put(median,8.)));
run;

proc means data=daten  noprint;
     var &var_list;
     output out=anzmiss  nmiss=&var_list;
run;

data anzmiss;
 set anzmiss;
 nmisst=sum(of &var_list);
 ncases=(&nvar)*(_freq_);
 proz=(nmisst/ncases)*100;
 call symput('proz',left(put(proz,5.1)));
 call symput('anzobs',left(put(_freq_,8.)));
run;

data &out;
  set &out;
  xf=finv(_n_/(&anzobs+1),&nvar,&anzobs-&nvar);
  di=finv(0.5,&nvar,&anzobs-&nvar)*(mahalanobis/&mdmed);
run;

data impuf;
 set
%if &f_noimpu = 0 %then %do;
   impu
%end;
   impu_m;
run;

proc sort data=impuf;
     by obs_nr;
run;

  /*  output   */
%if &list ne 0 %then %do;
%if &sysscp=WIN %then %do;
 goptions reset=all gunit=pct ctext=black htext=2  ftext=swiss dev=win;
%end;
%else %do;
 goptions reset=all gunit=pct ctext=black htext=2  ftext=swiss dev=xcolor;
%end;

 symbol1 i=none v=dot cv=red h=1;
 symbol2 i=join l=1 ci=black v=none w=2;
 axis1   c=black w=2 style=1 value=(f=swiss h=2.0) label=(" ");
 axis2   c=black w=2 style=1
         label=(f=swiss h=2.5 "DI") value=(f=swiss h=2.0);

 title1 f=swiss h=2.8 pct
  "Robust Multivariate Outlier Detection & Imputation with incomplete Survey Data: TRC ";
 title2 " ";
 title3 f=swiss h=2.3 pct
  "missing values: &proz %   N of Obs: &anzobs  Input File: &data   Output File: &out" ;
 title4 f=swiss h=2.3 pct
  "Imputation method : &rob_reg - Mahalanobis distance : &mdtxt - quality crit : &qkrit";

 title5 " ";
 title6 "Spearman Rank Correlation R";
 proc print data=corr  uniform noobs ;
      format _all_  11.5;
 run;
 title5 " ";
 title6 "Spearman Rank Correlation R~ = 2*sin(Pi/6 * R)";
 proc print data=rcorr uniform noobs ;
      format _all_  11.5;
 run;
 title5 " ";
 title6 "Symmetric Matrix S1~ = mad * R~ * mad";
 proc print data=scorr uniform noobs ;
      format _all_  11.5;
 run;
 title5 " ";
 title6 "Covariance  Matrix S~ (scatter matrix for Mahalanobis distances)";
 proc print data=scov  uniform noobs ;
      format _all_  11.5;
 run;
 title5 " ";
 title6 "Center vector m";
 proc print data=center  uniform noobs ;
      format _all_  11.5;
 run;
 proc freq data=impuf  noprint ;
   tables dep_var*regressor*cor / out=tabimp(drop=percent) nopercent;
 run;
 title5 " ";
 title6 "Imputation Statistics for the variables";
 proc print data=tabimp uniform noobs label;
      label count="N of obs imputed"
            dep_var="Variable"
            regressor=" using regressor"
            cor="Correlation";
      var   dep_var count regressor cor;
      format count 18.;
 run;

 %if &list eq 2 %then %do;
 title5 "Imputation History";
 proc print data=impuf  uniform noobs label;
      var obs_nr dep_var regressor slope intercept imputed n_obs;
      label n_obs="N of obs"
 run;
 %end;

 data plot;
  set &out;
  zf=xf;
 run;

 proc gplot data=plot;
 title5 " ";
   plot (di zf) * xf / overlay grid
      haxis=axis1
      vaxis=axis2
      frame;
 run; quit;
%end;

proc sort data=&out;
     by obs_nr;
run;

%ende_x:
%mend;
