hey folks,

I just got done writing a function to help with my data cleaning. It works -
actually a lot better than I thought it would - but I am sure there are ways
I could optimize this somewhat. The function takes two result sets (usually
first name, last name, city, state, zip) from two tables, then calculates
the similar_text() value for each pair (first name from table one, versus
first name from table two, and so forth) and places those values into a new
table. What I end up with is a sort of fuzzy matching lookup table, so I
could look up all values where there is a 90 percent match in each category
(for example), and then use that to update my original tables.

Hope that makes sense. Anyway, here's the code. If anyone has any
suggestions (such as, is it faster to dump into a local file and then move
that into a table?), I'm all ears. Thanks in advance:


<?php

function fuzzy_match($result_base, $result_compare)  {
     set_time_limit(400);

     //get the number of fields in each result
     $cols_base = mysql_num_fields($result_base);
     $cols_compare = mysql_num_fields($result_compare);

    //read field names into an array for both result sets
     for ($i=0; $i<$cols_base; $i++)
          {
           $base_arr[$i]=mysql_field_name($result_base, $i);
          }

     for ($i=0; $i<$cols_compare; $i++)
          {
           $compare_arr[$i]=mysql_field_name($result_compare, $i);
          }

 //delete the old table
     $query="drop table out_fuzzy";

 //if query fails, table does not exist so move on
 if(!mysql_query($query))
                          {
 }

 //create the SQL for new outfile
 $table_query1="create table out_fuzzy (id int(11) not null auto_increment,
";
 $table_query2=implode('_base varchar(50), ',$base_arr) . "_base
varchar(50), ";
 $table_query3=implode('_compare varchar(50), ',$compare_arr) . "_compare
varchar(50), ";
 $table_query4=implode('_pct double(6,1), ',$base_arr) . "_pct double(6,1),
primary key (id))  type=MyISAM";
 $qtable=$table_query1 . $table_query2 . $table_query3 . $table_query4;

 //create the new outfile
 mysql_query($qtable);


 while ($base = mysql_fetch_array($result_base))
  {
          mysql_data_seek($result_compare, 0);


              while ($compare = mysql_fetch_array($result_compare))
                       {

                                   $insert.="insert out_fuzzy set ";

                                           for ($i=0; $i<$cols_compare;
$i++)
                                           {


$insert.=mysql_field_name($result_compare, $i);
                                            $insert.="_compare=\"";
                                            $insert.=$compare[$i];
                                            $insert.="\", ";

$insert.=mysql_field_name($result_base, $i);
                                            $insert.="_base=\"";
                                            $insert.=$base[$i];
                                            $insert.="\", ";

$insert.=mysql_field_name($result_base, $i);
                                            $insert.="_pct=\"";

$base_tmp=trim(strtoupper($base[$i]));

$compare_tmp=trim(strtoupper($compare[$i]));
                                            $number=similar_text($base_tmp,
$compare_tmp, $last_sim);

$insert.=number_format($last_sim,1);
                                            $insert.="\", ";
                                             }
                         $insert=substr($insert, 0, -2);
                         mysql_query($insert) or die(mysql_error());

                          $insert="";




              }
 }




}

?>





-- 
PHP Database Mailing List (http://www.php.net/)
To unsubscribe, visit: http://www.php.net/unsub.php

Reply via email to