Re: [Perldl] UniqueND

Henning Glawe Thu, 20 Sep 2012 15:24:24 -0700

On Thu, Sep 20, 2012 at 01:13:42PM -0400, Chris Marshall wrote:
> There are no known bugs with either.
> 
> If you have a test case showing a failure,
> please report it.
> 
> Note:  The comparisons are strictly
> numeric so if you need some sort of
> approximate equality threshold for
> floating point, you would need to do
> the conversion yourself.


In fact, I have written some code implementing some of PDL::Ufunc
sorting/uniq functions with thresholds. Did not spend too much time on
updating the documentation/copyright info, but you can try the attached PP
code.

-- 
c u
henning

use strict; # be careful

pp_addpm({At=>'Top'},<<'EOD');

=head1 NAME

PDL::Approx - primitive approximate ufunc operations for pdl

=head1 DESCRIPTION

This module provides approximate vector sorting functions. Approximate
means in this case: component-wise comparison respecting a user-supplied
epsilon. This is particularly useful for PDLs of floating point types.

=head1 SYNOPSIS

 use PDL::Ufunc;

=cut

use PDL::Slices;
use Carp;

EOD

# check for bad value support
use PDL::Config;
my $bvalflag = $PDL::Config{WITH_BADVAL} || 0;


# should we use the finite() routine in libm ?
# (is the Windows version _finite() ?)
#
pp_addhdr(<<'EOD');
#define IsNaN(x) (x != x)
EOD

# Internal utility sorting routine for approx_qsortvec routines.
#
# note: we export them to the PDL Core structure for use in
#       other modules (eg Image2D)

for (keys %PDL::Types::typehash) {
    my $ctype = $PDL::Types::typehash{$_}{ctype};
    my $ppsym = $PDL::Types::typehash{$_}{ppsym};

    pp_addhdr(<<"FOO"

        /*******
         * approx_qsortvec helper routines
         *   --HG 07-May-2012
         */

        /* per-type epsilons for approximate comparisons
        */
        $ctype approx_epsilon_$ctype=0;
        /* Compare a vector in lexicographic order, returning the
         *  equivalent of "<=>". 
         */
      signed char pdl_approx_cmpvec_$ppsym($ctype *a, $ctype *b, int n) {
        int i;
        for(i=0; i<n; a++,b++,i++) {
         if( (*a-*b) < -approx_epsilon_$ctype ) return -1;
         if( (*a-*b) >  approx_epsilon_$ctype ) return 1;
        }
        return 0;
     }  
        
      void pdl_approx_qsortvec_$ppsym($ctype *xx, int n, int a, int b) {
                
        int i,j, median_ind;

        $ctype t;
        i = a; 
        j = b;

        median_ind = (i+j)/2;

        do {
          while( pdl_approx_cmpvec_$ppsym( &(xx[n*i]), &(xx[n*median_ind]), n ) 
 <  0 ) 
                i++;
          while( pdl_approx_cmpvec_$ppsym( &(xx[n*j]), &(xx[n*median_ind]), n ) 
 >  0 ) 
                j--;
          if(i<=j) {
                int k;
                $ctype *aa = &xx[n*i];
                $ctype *bb = &xx[n*j];
                for( k=0; k<n; aa++,bb++,k++ ) {
                  $ctype z;
                  z = *aa;
                  *aa = *bb;
                  *bb = z;      
                }

                if (median_ind==i)
                  median_ind=j;
                else if (median_ind==j)
                  median_ind=i;

                i++; 
                j--;
          }
        } while (i <= j);
        
        if (a < j)
          pdl_approx_qsortvec_$ppsym( xx, n, a, j );
        if (i < b)
          pdl_approx_qsortvec_$ppsym( xx, n, i, b );
        
      }

          void pdl_approx_bsortvec_$ppsym($ctype *xx, int n, int l) {
        int min_idx,max_idx;
        int first_swap,last_swap;
        int i,k;
                $ctype z;
        min_idx = 0;
        max_idx = l;
        last_swap = 42;
        while (last_swap >= 0) {
          last_swap  = -1;
//                printf("  %5d %5d     %5d\\n",min_idx,max_idx,n);
//                fflush(stdout);
          for (i=min_idx;i<max_idx;i++) {
            if (pdl_approx_cmpvec_$ppsym( &(xx[n*i]), &(xx[n*(i+1)]), n ) > 0) {
              if (last_swap < 0) {
                min_idx = (i > 1 ? i-1 : 0);
              }
                          last_swap = i;
                      $ctype *aa = &xx[n*i];
                  $ctype *bb = &xx[n*(i+1)];
              for( k=0; k<n; aa++,bb++,k++ ) {
                z = *aa;
                *aa = *bb;
                *bb = z;
              }
                        }
                  }
                  max_idx=last_swap+1;
                }
      }

          void pdl_approx_isortvec_$ppsym($ctype *xx, $ctype *yy, int n, int l) 
{
                int offset_in[l+1];
                int offset_sorted[l+1];
                int i,j,k,o,cmpres;
                int mv_start;
                for (i=0,o=0;i<=l;i++,o+=n)
                  offset_in[i]=o;
                for (i=0,o=0;i<=l;i++,o+=n)
                  offset_sorted[i]=-1;
                offset_sorted[0]=offset_in[0];
                for (i=1;i<=l;i++) {
          for (j=i;j>0;j--) {
            cmpres=pdl_approx_cmpvec_$ppsym( &(xx[offset_in[i]]), 
&(xx[offset_sorted[j-1]]), n );
            if (cmpres >= 0)
                          break;
          }
                  mv_start = (j > 0 ? j : 1);
                  for (k=i;k>=mv_start;k--) {
                    offset_sorted[k]=offset_sorted[k-1];
                  }
                  offset_sorted[j]=offset_in[i];
        }
                $ctype *aa = yy;
                for (i=0;i<=l;i++) {
                  $ctype *bb = &xx[offset_sorted[i]];
          for (j=0;j<n;j++,aa++,bb++) {
                    *aa = *bb;
                  }
        }
      }

          void pdl_approx_markvec_$ppsym($ctype *xx,int *res, int n, int l) {
        int i;
                res[l]=42;
        for (i=0;i<l;i++) {
                  res[i]=pdl_approx_cmpvec_$ppsym( &(xx[n*i]), &(xx[n*(i+1)]), 
n );
                }
      }

      void pdl_approx_qsortvec_ind_$ppsym($ctype *xx, int *ix, int n, int a, 
int b) {
                
        int i,j, median_ind;

        $ctype t;
        i = a; 
        j = b;

        median_ind = (i+j)/2;

        do {
          while( pdl_approx_cmpvec_$ppsym( &(xx[n*ix[i]]), 
&(xx[n*ix[median_ind]]), n )  <  0 ) 
                i++;
          while( pdl_approx_cmpvec_$ppsym( &(xx[n*ix[j]]), 
&(xx[n*ix[median_ind]]), n )  >  0 ) 
                j--;
          if(i<=j) {
                int k;
                k = ix[i];
                ix[i] = ix[j];
                ix[j] = k;              

                if (median_ind==i)
                  median_ind=j;
                else if (median_ind==j)
                  median_ind=i;

                i++; 
                j--;
          }
        } while (i <= j);
        
        if (a < j)
          pdl_qsortvec_ind_$ppsym( xx, ix, n, a, j );
        if (i < b)
          pdl_qsortvec_ind_$ppsym( xx, ix, n, i, b );
        
      }

FOO
   );
}

# when copying the data over to the temporary array,
# ignore the bad values and then only send the number
# of good elements to the sort routines
#

sub generic_approx_qsortvec {
    my $pdl = shift;
    my $ndim = shift;
    return 
'$TBSULQFD(pdl_approx_qsortvec_B,pdl_approx_qsortvec_S,pdl_approx_qsortvec_U,
             
pdl_approx_qsortvec_L,pdl_approx_qsortvec_Q,pdl_approx_qsortvec_F,pdl_approx_qsortvec_D)
 ($P(' . $pdl . '), '. $ndim.', 0, nn);';
}

sub generic_approx_bsortvec {
    my $pdl = shift;
    my $ndim = shift;
    return 
'$TBSULQFD(pdl_approx_bsortvec_B,pdl_approx_bsortvec_S,pdl_approx_bsortvec_U,
             
pdl_approx_bsortvec_L,pdl_approx_bsortvec_Q,pdl_approx_bsortvec_F,pdl_approx_bsortvec_D)
 ($P(' . $pdl . '), '. $ndim.', nn);';
}

sub generic_approx_isortvec {
    my $src = shift;
        my $dst = shift;
    my $ndim = shift;
    return 
'$TBSULQFD(pdl_approx_isortvec_B,pdl_approx_isortvec_S,pdl_approx_isortvec_U,
             
pdl_approx_isortvec_L,pdl_approx_isortvec_Q,pdl_approx_isortvec_F,pdl_approx_isortvec_D)
 ($P(' . $src . '), $P(' . $dst . '), ' . $ndim.', nn);';
}

sub generic_approx_markvec {
    my $pdl = shift;
        my $res = shift;
    my $ndim = shift;
    return 
'$TBSULQFD(pdl_approx_markvec_B,pdl_approx_markvec_S,pdl_approx_markvec_U,
             
pdl_approx_markvec_L,pdl_approx_markvec_Q,pdl_approx_markvec_F,pdl_approx_markvec_D)
 ($P(' . $pdl . '), $P(' . $res . '), '. $ndim.', nn);';
}

sub generic_approx_cmpvec {
    my $pdl1 = shift;
        my $pdl2 = shift;
    return 
'$TBSULQFD(pdl_approx_cmpvec_B,pdl_approx_cmpvec_S,pdl_approx_cmpvec_U,
             
pdl_approx_cmpvec_L,pdl_approx_cmpvec_Q,pdl_approx_cmpvec_F,pdl_approx_cmpvec_D)
 ($P(' . $pdl1 . '), $P(' . $pdl2 . '),  nd);';
}

# move all bad values to the end of the array
#
pp_def(
    'approx_qsortvec',
    HandleBad => 1,
    Pars => 'a(n,m);epsilon(); [o]b(n,m);',
    Code => 
    'int nn;
     int nd;
         approx_epsilon_$GENERIC() = $epsilon();
     loop(n,m) %{ $b() = $a(); %}
     nn = ($COMP(__m_size))-1;
     nd = $COMP(__n_size);
    ' . generic_approx_qsortvec('b','nd'),
    Doc => '
=for ref

Sort a list of vectors lexicographically. Comparison operations include an
epsilon.

Vector a is only counted as greater/less than vector b if 2 components differ
by more than epsilon.

The 0th dimension of the source piddle is dimension in the vector;
the 1st dimension is list order.  Higher dimensions are threaded over.

=for example

 print 
pdl([[1.0000000,3],[1.0000001,2],[1.0000002,1]])->approx_qsortvec(0.00001)

 [
  [ 1.0000002          1]
  [ 1.0000001          2]
  [         1          3]
 ]


=cut

',
    BadDoc =>
'
Vectors with bad components should be moved to the end of the array:
',
    ); # pp_def qsort

pp_def(
    'approx_bsortvec',
    HandleBad => 1,
    Pars => 'a(n,m);epsilon(); [o]b(n,m);',
    Code => 
    'int nn;
     int nd;
         approx_epsilon_$GENERIC() = $epsilon();
     loop(n,m) %{ $b() = $a(); %}
     nn = ($COMP(__m_size))-1;
     nd = $COMP(__n_size);
    ' . generic_approx_bsortvec('b','nd')
);

pp_def(
    'approx_isortvec',
    HandleBad => 1,
    Pars => 'a(n,m);epsilon(); [o]b(n,m);',
    Code => 
    'int nn;
     int nd;
         approx_epsilon_$GENERIC() = $epsilon();
     nn = ($COMP(__m_size))-1;
     nd = $COMP(__n_size);
    ' . generic_approx_isortvec('a', 'b', 'nd')
);

pp_def(
    'approx_markvec',
    HandleBad => 1,
    Pars => 'a(n,m);epsilon(); int [o]res(m);',
    Code => 
    'int nn;
     int nd;
         approx_epsilon_$GENERIC() = $epsilon();
     nn = ($COMP(__m_size))-1;
     nd = $COMP(__n_size);
    ' . generic_approx_markvec('a','res','nd')
);

pp_def(
    'approx_cmpvec',
    HandleBad => 1,
    Pars => 'a(n);b(n);epsilon(); int [o]res();',
    Code => 
    'int nd;
         approx_epsilon_$GENERIC() = $epsilon();
     nd = $COMP(__n_size);
     $res() = ' . generic_approx_cmpvec('a','b','nd')
);

sub generic_approx_qsortvec_ind {
    my $pdl = shift;
    my $ndim = shift;
    return 
'$TBSULQFD(pdl_approx_qsortvec_ind_B,pdl_approx_qsortvec_ind_S,pdl_approx_qsortvec_ind_U,
             
pdl_approx_qsortvec_ind_L,pdl_approx_qsortvec_ind_Q,pdl_approx_qsortvec_ind_F,pdl_approx_qsortvec_ind_D)
 ($P(' . $pdl . '), $P(indx), '. $ndim.', 0, nn);';
}

pp_def(
    'approx_qsortveci',
    HandleBad => 1,
    Pars => 'a(n,m); epsilon(); int [o]indx(m);',
    Code => 
    'int nd;
     int nn=$COMP(__m_size)-1;
         approx_epsilon_$GENERIC() = $epsilon();
     loop(m) %{
        $indx()=m;
     %}
     nd = $COMP(__n_size);
    ' . generic_approx_qsortvec_ind('a','nd'),
    Doc => '
=for ref

Sort a list of vectors lexicographically, returning the indices of the
sorted vectors rather than the sorted list itself. Comparison operations
include an epsilon.

Vector a is only counted as greater/less than vector b if 2 components differ
by more than epsilon.


As with C<approx_qsortvec>, the input PDL should be an NxM array containing M
separate N-dimensional vectors.  The return value is an integer M-PDL 
containing the M-indices of original array rows, in sorted order.

As with C<approx_qsortvec>, the zeroth element of the vectors runs slowest in 
the
sorted list.  

Additional dimensions are threaded over: each plane is sorted separately,
so approx_qsortveci may be thought of as a collapse operator of sorts (groan).

=cut

',
    BadDoc =>
'
Vectors with bad components should be moved to the end of the array:
',
    ); 

pp_add_exported ('', 'approx_uniq');
pp_addpm(<<'EOPM');

*approx_uniq = \&PDL::approx_uniq;
# return unique elements of array
# find as jumps in the sorted array
# flattens in the process
sub PDL::approx_uniq {
   use PDL::Core 'barf';
   my ($arr,$epsilon) = @_;
   return $arr if($arr->nelem == 0); # The null list is unique (CED)
   my $srt  = $arr->clump(-1)->where($arr==$arr)->qsort();  # no NaNs or BADs 
for qsort
   my $nans = $arr->clump(-1)->where($arr!=$arr);
   my $uniq = ($srt->nelem > 0) ? $srt->where(abs($srt-$srt->rotate(-1)) > 
$epsilon) : $srt;
   # make sure we return something if there is only one value
   my $answ = $nans;  # NaN values always uniq
   if ( $uniq->nelem > 0 ) {
      $answ = $uniq->append($answ);
   } else {
      $answ = ( ($srt->nelem == 0) ?  $srt : pdl( [$srt->index(0)] ) 
)->append($answ);
   }
   return $answ;
}

EOPM

pp_add_exported ('', 'approx_uniqind');
pp_addpm(<<'EOPM');

*approx_uniqind = \&PDL::approx_uniqind;
# return unique elements of array
# find as jumps in the sorted array
# flattens in the process
sub PDL::approx_uniqind {
  use PDL::Core 'barf';
  my ($arr,$epsilon) = @_;
  return $arr if($arr->nelem == 0); # The null list is unique (CED)
  # Different from uniq we sort and store the result in an intermediary
  my $aflat = $arr->flat;
  my $nanind = which($aflat!=$aflat);                        # NaN indexes
  my $good = $aflat->sequence->long->where($aflat==$aflat);  # good indexes
  my $i_srt = $aflat->where($aflat==$aflat)->qsorti;         # no BAD or NaN 
values for qsorti
  my $srt = $aflat->where($aflat==$aflat)->index($i_srt);
  my $uniqind;
  if ($srt->nelem > 0) {
     $uniqind = which(abs($srt-$srt->rotate(-1)) > $epsilon);
     $uniqind = $i_srt->slice('0') if $uniqind->isempty;
  } else {
     $uniqind = which($srt);
  }
  # Now map back to the original space
  my $ansind = $nanind;
  if ( $uniqind->nelem > 0 ) {
     $ansind = ($good->index($i_srt->index($uniqind)))->append($ansind);   
  } else {
     $ansind = $uniqind->append($ansind);
  }
  return $ansind;
}

EOPM

pp_add_exported ('', 'approx_uniqvec');
pp_addpm(<<'EOPM');

sub PDL::approx_uniqvec {

   my($pdl,$epsilon) = @_;

   return $pdl if ( $pdl->nelem == 0 || $pdl->ndims < 2 );
   return $pdl if ( $pdl->slice("(0)")->nelem < 2 );                     # 
slice isn't cheap but uniqvec isn't either

   my $pdl2d = null;
   $pdl2d = $pdl->mv(0,-1)->clump($pdl->ndims-1)->mv(-1,0);              # 
clump all but dim(0)

   my $ngood = null;
   $ngood = $pdl2d->ones->sumover;
   $ngood = $pdl2d->ngoodover if  ($PDL::Bad::Status && $pdl->badflag);  # 
number of good values each vector
   my $ngood2 = null;
   $ngood2 = $ngood->where($ngood);                                      # 
number of good values with no all-BADs

   $pdl2d = $pdl2d->mv(0,-1)->dice($ngood->which)->mv(-1,0);             # 
remove all-BAD vectors


   my $numnan = null;
   $numnan = ($pdl2d!=$pdl2d)->sumover;                                  # 
works since no all-BADs to confuse

   my $presrt = null;
   $presrt = $pdl2d->mv(0,-1)->dice($numnan->not->which)->mv(0,-1);      # 
remove vectors with any NaN values
   my $nanvec = null;
   $nanvec = $pdl2d->mv(0,-1)->dice($numnan->which)->mv(0,-1);           # the 
vectors with any NaN values

   # use dice instead of nslice since qsortvec might be packing
   # the badvals to the front of the array instead of the end like
   # the docs say. If that is the case and it gets fixed, it won't
   # bust uniqvec. DAL 14-March 2006

   my $srt = null;
   $srt = $presrt->approx_qsortvec($epsilon)->mv(0,-1);                         
          # BADs are sorted by qsortvec
   my $srtdice = $srt;
   my $somebad = null;
   if  ($PDL::Bad::Status && $srt->badflag) {
      $srtdice = $srt->dice($srt->mv(0,-1)->nbadover->not->which);
      $somebad = $srt->dice($srt->mv(0,-1)->nbadover->which);
   }

   my $uniq = null;
   if ($srtdice->nelem > 0) {
      $uniq = (abs($srtdice-$srtdice->rotate(-1)) > 
$epsilon)->mv(0,-1)->orover->which;
   } else {
      $uniq = $srtdice->orover->which;
   }

   my $ans = null;
   if ( $uniq->nelem > 0 ) {
      $ans = $srtdice->dice($uniq);
   } else {
      $ans = ($srtdice->nelem > 0) ? $srtdice->slice("0,:") : $srtdice;
   }
   return $ans->append($somebad)->append($nanvec->mv(0,-1))->mv(0,-1);
}

EOPM

pp_addpm({At=>'Bot'},<<'EOD');

=head1 AUTHOR

Copyright (C) Tuomas J. Lukka 1997 ([email protected]).
Contributions by Christian Soeller ([email protected]),
Karl Glazebrook ([email protected]) and 
Henning Glawe ([email protected]).  All rights
reserved. There is no warranty. You are allowed to redistribute this
software / documentation under certain conditions. For details, see
the file COPYING in the PDL distribution. If this file is separated
from the PDL distribution, the copyright notice should be included in
the file.

=cut

EOD

pp_done();

_______________________________________________
Perldl mailing list
[email protected]
http://mailman.jach.hawaii.edu/mailman/listinfo/perldl

Re: [Perldl] UniqueND

Reply via email to