This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

The following commit(s) were added to refs/heads/master by this push:
       new  6b45621   TIKA-2318 fix exception/common count comparisons to 
include both mime_type_a and mime_type_b
6b45621 is described below

commit 6b45621d02f809493521cfb3654aaf6511dfe793
Author: tballison <[email protected]>
AuthorDate: Thu Apr 6 13:55:03 2017 -0400

    TIKA-2318 fix exception/common count comparisons to include both
    mime_type_a and mime_type_b
---
 .../src/main/resources/comparison-reports.xml      | 214 ++++++++++++++++++---
 1 file changed, 188 insertions(+), 26 deletions(-)

diff --git a/tika-eval/src/main/resources/comparison-reports.xml 
b/tika-eval/src/main/resources/comparison-reports.xml
index b980d69..e59d474 100644
--- a/tika-eval/src/main/resources/comparison-reports.xml
+++ b/tika-eval/src/main/resources/comparison-reports.xml
@@ -45,59 +45,180 @@
             having cnt &gt; 1
             order by cnt desc
         </sql>
+        <!-- build mime indexes -->
+
         <sql>create index if not exists pa_m_idx
             on profiles_a (mime_type_id);
         </sql>
+
         <sql>
             create index if not exists pb_m_idx
             on profiles_b (mime_type_id);
         </sql>
 
+        <!-- build exceptions comparison table -->
         <sql>drop table if exists exceptions_compared</sql>
         <sql>
             create table exceptions_compared
-            (mime_type_id integer primary key,
-            exceptions_a integer,
-            total_a integer,
-            percent_exceptions_a double,
-            exceptions_b integer,
-            total_b integer,
-            percent_exceptions_b double)
+            (mime_type_id_a integer, mime_type_id_b integer,
+            exceptions_a integer default 0,
+            total_a integer default 0,
+            percent_exceptions_a double default 0.0,
+            exceptions_b integer default 0,
+            total_b integer default 0,
+            percent_exceptions_b double default 0.0);
         </sql>
         <sql>
-            insert into exceptions_compared (mime_type_id)
-            select mime_type_id from mimes;
+
+            insert into exceptions_compared (mime_type_id_a, mime_type_id_b)
+            select ma.mime_type_id, mb.mime_type_id
+            from profiles_a a
+            join profiles_b b on a.id=b.id
+            join mimes ma on ma.mime_type_id=a.mime_type_id
+            join mimes mb on mb.mime_type_id=b.mime_type_id
+            group by ma.mime_type_id, mb.mime_type_id
         </sql>
 
         <sql>
             update exceptions_compared ec set total_a=(
-            select count(1) as cnt from profiles_a
-            where profiles_a.mime_type_id= ec.mime_type_id
-            group by mime_type_id
-            )
+            select count(1) as cnt from profiles_a pa
+            join profiles_b pb on pa.id=pb.id
+            where pa.mime_type_id= ec.mime_type_id_a
+            and pb.mime_type_id=ec.mime_type_id_b
+            group by pa.mime_type_id, pb.mime_type_id);
         </sql>
         <sql>
             update exceptions_compared ec set total_b=(
-            select count(1) as cnt from profiles_b
-            where profiles_b.mime_type_id= ec.mime_type_id
-            group by mime_type_id
-            )
+            select count(1) as cnt from profiles_b pb
+            join profiles_a pa on pa.id=pb.id
+            where pa.mime_type_id= ec.mime_type_id_a
+            and pb.mime_type_id=ec.mime_type_id_b
+            group by pb.mime_type_id, pa.mime_type_id);
         </sql>
         <sql>
-            update exceptions_compared ec set exceptions_a=( select count(1) as
-            cnt from exceptions_a ea
+            update exceptions_compared ec set exceptions_a=
+            ( select count(1) as cnt from exceptions_a ea
             join profiles_a pa on ea.id=pa.id
-            where pa.mime_type_id= ec.mime_type_id
+            join profiles_b pb on pa.id=pb.id
+            where pa.mime_type_id= ec.mime_type_id_a
+            and pb.mime_type_id=ec.mime_type_id_b
             and parse_exception_type_id=0
-            group by mime_type_id )
+            group by pa.mime_type_id, pb.mime_type_id);
         </sql>
         <sql>
-            update exceptions_compared ec set exceptions_b=(
-            select count(1) as cnt from exceptions_b eb
-            join profiles_b pb on eb.id=pb.id
-            where pb.mime_type_id= ec.mime_type_id
+            update exceptions_compared ec set exceptions_b=
+            ( select count(1) as cnt from exceptions_b eb
+            join profiles_b pb on eb.id=pa.id
+            join profiles_a pa on pa.id=pb.id
+            where pa.mime_type_id= ec.mime_type_id_a
+            and pb.mime_type_id=ec.mime_type_id_b
             and parse_exception_type_id=0
-            group by mime_type_id )
+            group by pb.mime_type_id, pa.mime_type_id);
+        </sql>
+
+        <sql>
+            update exceptions_compared
+            set percent_exceptions_a =
+            (cast (exceptions_a as decimal))/(cast (total_a as decimal))
+            where total_a &gt; 0
+        </sql>
+        <sql>
+            update exceptions_compared
+            set percent_exceptions_b =
+            (cast (exceptions_b as decimal))/(cast (total_b as decimal))
+            where total_b &gt; 0
+        </sql>
+
+        <!-- build tmp common words table -->
+        <!-- build exceptions comparison table -->
+        <sql>drop table if exists token_counts_compared</sql>
+        <sql>
+            create table token_counts_compared
+            (mime_type_id_a integer,
+            mime_type_id_b integer,
+            num_tokens_a integer default 0,
+            num_tokens_b integer default 0,
+            num_alphabetic_tokens_a integer default 0,
+            num_alphabetic_tokens_b integer default 0,
+            num_common_tokens_a integer default 0,
+            num_common_tokens_b integer default 0,
+            );
+        </sql>
+        <sql>
+            insert into token_counts_compared (mime_type_id_a, mime_type_id_b)
+            select ma.mime_type_id, mb.mime_type_id
+            from profiles_a a
+            join profiles_b b on a.id=b.id
+            join mimes ma on ma.mime_type_id=a.mime_type_id
+            join mimes mb on mb.mime_type_id=b.mime_type_id
+            group by ma.mime_type_id, mb.mime_type_id
+
+        </sql>
+
+        <sql>
+            update token_counts_compared tcc set num_tokens_a=(
+            select sum(num_tokens) as cnt from profiles_a pa
+            join profiles_b pb on pa.id=pb.id
+            join contents_a c on c.id = pa.id
+            where pb.mime_type_id= tcc.mime_type_id_b
+            and pa.mime_type_id=tcc.mime_type_id_a
+            group by mime_type_id_a, mime_type_id_b
+            );
+        </sql>
+
+        <sql>
+            update token_counts_compared tcc set num_tokens_b=(
+            select sum(num_tokens) as cnt from profiles_b pb
+            join profiles_a pa on pa.id=pb.id
+            join contents_b c on c.id = pb.id
+            where pb.mime_type_id= tcc.mime_type_id_b
+            and pa.mime_type_id=tcc.mime_type_id_a
+            group by mime_type_id_a, mime_type_id_b
+            );
+        </sql>
+
+        <sql>
+            update token_counts_compared tcc set num_alphabetic_tokens_a=(
+            select sum(num_alphabetic_tokens) as cnt from profiles_a pa
+            join profiles_b pb on pa.id=pb.id
+            join contents_a c on c.id = pa.id
+            where pb.mime_type_id= tcc.mime_type_id_b
+            and pa.mime_type_id=tcc.mime_type_id_a
+            group by mime_type_id_a, mime_type_id_b
+            );
+        </sql>
+
+        <sql>
+            update token_counts_compared tcc set num_alphabetic_tokens_b=(
+            select sum(num_alphabetic_tokens) as cnt from profiles_b pb
+            join profiles_a pa on pb.id=pa.id
+            join contents_b c on c.id = pb.id
+            where pb.mime_type_id= tcc.mime_type_id_b
+            and pa.mime_type_id=tcc.mime_type_id_a
+            group by mime_type_id_a, mime_type_id_b
+            );
+        </sql>
+
+        <sql>
+            update token_counts_compared tcc set num_common_tokens_a=(
+            select sum(num_common_tokens) as cnt from profiles_a pa
+            join profiles_b pb on pa.id=pb.id
+            join contents_a c on c.id = pa.id
+            where pb.mime_type_id= tcc.mime_type_id_b
+            and pa.mime_type_id=tcc.mime_type_id_a
+            group by mime_type_id_a, mime_type_id_b
+            );
+        </sql>
+
+        <sql>
+            update token_counts_compared tcc set num_common_tokens_b=(
+            select sum(num_common_tokens) as cnt from profiles_b pb
+            join profiles_a pa on pa.id=pb.id
+            join contents_b c on c.id = pb.id
+            where pb.mime_type_id= tcc.mime_type_id_b
+            and pa.mime_type_id=tcc.mime_type_id_a
+            group by mime_type_id_a, mime_type_id_b
+            );
         </sql>
 
     </before>
@@ -224,6 +345,8 @@
         </sql>
     </report>
 
+
+    <!-- Exceptions -->
     <report reportName="AllExceptionsByMimeA"
             reportFilename="exceptions/exceptions_by_mime_A.xlsx"
             format="xlsx"
@@ -600,6 +723,8 @@
             ca.num_common_tokens as NUM_COMMON_TOKENS_A,
             cb.common_tokens_lang as COMMON_TOKENS_LANG_B,
             cb.num_common_tokens as NUM_COMMON_TOKENS_B,
+            ifnull(cb.num_common_tokens,0)-
+            ifnull(ca.num_common_tokens, 0) as NUM_COMMON_TOKENS_DIFF_IN_B,
             ca.top_n_tokens as TOP_N_TOKENS_A,
             cb.top_n_tokens as TOP_N_TOKENS_B,
             ca.unicode_char_blocks as UNICODE_CHAR_BLOCKS_A,
@@ -643,6 +768,8 @@
             ca.num_common_tokens as NUM_COMMON_TOKENS_A,
             cb.common_tokens_lang as COMMON_TOKENS_LANG_B,
             cb.num_common_tokens as NUM_COMMON_TOKENS_B,
+            ifnull(cb.num_common_tokens,0)-
+            ifnull(ca.num_common_tokens, 0) as NUM_COMMON_TOKENS_DIFF_IN_B,
             ca.top_n_tokens as TOP_N_TOKENS_A,
             cb.top_n_tokens as TOP_N_TOKENS_B,
             ca.unicode_char_blocks as UNICODE_CHAR_BLOCKS_A,
@@ -667,6 +794,41 @@
             limit 100000
         </sql>
     </report>
+
+    <report reportName="CommonTokenComparisonsByMimeType"
+            reportFilename="content/common_token_comparisons_by_mime.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select ma.mime_string, mb.mime_string,
+            num_tokens_a, num_tokens_b
+            num_alphabetic_tokens_a, num_alphabetic_tokens_b,
+            num_common_tokens_a, num_common_tokens_b,
+            ifnull(num_common_tokens_b, 0)-ifnull(num_common_tokens_a, 0) as 
change_in_common_tokens_b
+            from token_counts_compared tcc
+            join mimes ma on tcc.mime_type_id_a = ma.mime_type_id
+            join mimes mb on tcc.mime_type_id_b = mb.mime_type_id
+            order by change_in_common_tokens_b desc
+        </sql>
+    </report>
+
+
+    <report reportName="ExceptionComparisonsByMimeType"
+            reportFilename="exceptions/exceptions_compared_by_mime_type.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select ma.mime_string, mb.mime_string, exceptions_a,
+            total_a, percent_exceptions_a,
+            exceptions_b, total_b, percent_exceptions_b
+            from exceptions_compared c
+            join mimes ma on ma.mime_type_id=c.mime_type_id_a
+            join mimes mb on mb.mime_type_id=c.mime_type_id_b
+            order by percent_exceptions_b desc, total_b desc;
+        </sql>
+    </report>
     <!--    <report reportName="MD5 Duplicate Counts A"
                 reportFilename="md5/md5_duplicate_counts_A.xlsx"
                 format="xlsx"

-- 
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].

Reply via email to