This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 6b45621 TIKA-2318 fix exception/common count comparisons to
include both mime_type_a and mime_type_b
6b45621 is described below
commit 6b45621d02f809493521cfb3654aaf6511dfe793
Author: tballison <[email protected]>
AuthorDate: Thu Apr 6 13:55:03 2017 -0400
TIKA-2318 fix exception/common count comparisons to include both
mime_type_a and mime_type_b
---
.../src/main/resources/comparison-reports.xml | 214 ++++++++++++++++++---
1 file changed, 188 insertions(+), 26 deletions(-)
diff --git a/tika-eval/src/main/resources/comparison-reports.xml
b/tika-eval/src/main/resources/comparison-reports.xml
index b980d69..e59d474 100644
--- a/tika-eval/src/main/resources/comparison-reports.xml
+++ b/tika-eval/src/main/resources/comparison-reports.xml
@@ -45,59 +45,180 @@
having cnt > 1
order by cnt desc
</sql>
+ <!-- build mime indexes -->
+
<sql>create index if not exists pa_m_idx
on profiles_a (mime_type_id);
</sql>
+
<sql>
create index if not exists pb_m_idx
on profiles_b (mime_type_id);
</sql>
+ <!-- build exceptions comparison table -->
<sql>drop table if exists exceptions_compared</sql>
<sql>
create table exceptions_compared
- (mime_type_id integer primary key,
- exceptions_a integer,
- total_a integer,
- percent_exceptions_a double,
- exceptions_b integer,
- total_b integer,
- percent_exceptions_b double)
+ (mime_type_id_a integer, mime_type_id_b integer,
+ exceptions_a integer default 0,
+ total_a integer default 0,
+ percent_exceptions_a double default 0.0,
+ exceptions_b integer default 0,
+ total_b integer default 0,
+ percent_exceptions_b double default 0.0);
</sql>
<sql>
- insert into exceptions_compared (mime_type_id)
- select mime_type_id from mimes;
+
+ insert into exceptions_compared (mime_type_id_a, mime_type_id_b)
+ select ma.mime_type_id, mb.mime_type_id
+ from profiles_a a
+ join profiles_b b on a.id=b.id
+ join mimes ma on ma.mime_type_id=a.mime_type_id
+ join mimes mb on mb.mime_type_id=b.mime_type_id
+ group by ma.mime_type_id, mb.mime_type_id
</sql>
<sql>
update exceptions_compared ec set total_a=(
- select count(1) as cnt from profiles_a
- where profiles_a.mime_type_id= ec.mime_type_id
- group by mime_type_id
- )
+ select count(1) as cnt from profiles_a pa
+ join profiles_b pb on pa.id=pb.id
+ where pa.mime_type_id= ec.mime_type_id_a
+ and pb.mime_type_id=ec.mime_type_id_b
+ group by pa.mime_type_id, pb.mime_type_id);
</sql>
<sql>
update exceptions_compared ec set total_b=(
- select count(1) as cnt from profiles_b
- where profiles_b.mime_type_id= ec.mime_type_id
- group by mime_type_id
- )
+ select count(1) as cnt from profiles_b pb
+ join profiles_a pa on pa.id=pb.id
+ where pa.mime_type_id= ec.mime_type_id_a
+ and pb.mime_type_id=ec.mime_type_id_b
+ group by pb.mime_type_id, pa.mime_type_id);
</sql>
<sql>
- update exceptions_compared ec set exceptions_a=( select count(1) as
- cnt from exceptions_a ea
+ update exceptions_compared ec set exceptions_a=
+ ( select count(1) as cnt from exceptions_a ea
join profiles_a pa on ea.id=pa.id
- where pa.mime_type_id= ec.mime_type_id
+ join profiles_b pb on pa.id=pb.id
+ where pa.mime_type_id= ec.mime_type_id_a
+ and pb.mime_type_id=ec.mime_type_id_b
and parse_exception_type_id=0
- group by mime_type_id )
+ group by pa.mime_type_id, pb.mime_type_id);
</sql>
<sql>
- update exceptions_compared ec set exceptions_b=(
- select count(1) as cnt from exceptions_b eb
- join profiles_b pb on eb.id=pb.id
- where pb.mime_type_id= ec.mime_type_id
+ update exceptions_compared ec set exceptions_b=
+ ( select count(1) as cnt from exceptions_b eb
+ join profiles_b pb on eb.id=pa.id
+ join profiles_a pa on pa.id=pb.id
+ where pa.mime_type_id= ec.mime_type_id_a
+ and pb.mime_type_id=ec.mime_type_id_b
and parse_exception_type_id=0
- group by mime_type_id )
+ group by pb.mime_type_id, pa.mime_type_id);
+ </sql>
+
+ <sql>
+ update exceptions_compared
+ set percent_exceptions_a =
+ (cast (exceptions_a as decimal))/(cast (total_a as decimal))
+ where total_a > 0
+ </sql>
+ <sql>
+ update exceptions_compared
+ set percent_exceptions_b =
+ (cast (exceptions_b as decimal))/(cast (total_b as decimal))
+ where total_b > 0
+ </sql>
+
+ <!-- build tmp common words table -->
+ <!-- build exceptions comparison table -->
+ <sql>drop table if exists token_counts_compared</sql>
+ <sql>
+ create table token_counts_compared
+ (mime_type_id_a integer,
+ mime_type_id_b integer,
+ num_tokens_a integer default 0,
+ num_tokens_b integer default 0,
+ num_alphabetic_tokens_a integer default 0,
+ num_alphabetic_tokens_b integer default 0,
+ num_common_tokens_a integer default 0,
+ num_common_tokens_b integer default 0,
+ );
+ </sql>
+ <sql>
+ insert into token_counts_compared (mime_type_id_a, mime_type_id_b)
+ select ma.mime_type_id, mb.mime_type_id
+ from profiles_a a
+ join profiles_b b on a.id=b.id
+ join mimes ma on ma.mime_type_id=a.mime_type_id
+ join mimes mb on mb.mime_type_id=b.mime_type_id
+ group by ma.mime_type_id, mb.mime_type_id
+
+ </sql>
+
+ <sql>
+ update token_counts_compared tcc set num_tokens_a=(
+ select sum(num_tokens) as cnt from profiles_a pa
+ join profiles_b pb on pa.id=pb.id
+ join contents_a c on c.id = pa.id
+ where pb.mime_type_id= tcc.mime_type_id_b
+ and pa.mime_type_id=tcc.mime_type_id_a
+ group by mime_type_id_a, mime_type_id_b
+ );
+ </sql>
+
+ <sql>
+ update token_counts_compared tcc set num_tokens_b=(
+ select sum(num_tokens) as cnt from profiles_b pb
+ join profiles_a pa on pa.id=pb.id
+ join contents_b c on c.id = pb.id
+ where pb.mime_type_id= tcc.mime_type_id_b
+ and pa.mime_type_id=tcc.mime_type_id_a
+ group by mime_type_id_a, mime_type_id_b
+ );
+ </sql>
+
+ <sql>
+ update token_counts_compared tcc set num_alphabetic_tokens_a=(
+ select sum(num_alphabetic_tokens) as cnt from profiles_a pa
+ join profiles_b pb on pa.id=pb.id
+ join contents_a c on c.id = pa.id
+ where pb.mime_type_id= tcc.mime_type_id_b
+ and pa.mime_type_id=tcc.mime_type_id_a
+ group by mime_type_id_a, mime_type_id_b
+ );
+ </sql>
+
+ <sql>
+ update token_counts_compared tcc set num_alphabetic_tokens_b=(
+ select sum(num_alphabetic_tokens) as cnt from profiles_b pb
+ join profiles_a pa on pb.id=pa.id
+ join contents_b c on c.id = pb.id
+ where pb.mime_type_id= tcc.mime_type_id_b
+ and pa.mime_type_id=tcc.mime_type_id_a
+ group by mime_type_id_a, mime_type_id_b
+ );
+ </sql>
+
+ <sql>
+ update token_counts_compared tcc set num_common_tokens_a=(
+ select sum(num_common_tokens) as cnt from profiles_a pa
+ join profiles_b pb on pa.id=pb.id
+ join contents_a c on c.id = pa.id
+ where pb.mime_type_id= tcc.mime_type_id_b
+ and pa.mime_type_id=tcc.mime_type_id_a
+ group by mime_type_id_a, mime_type_id_b
+ );
+ </sql>
+
+ <sql>
+ update token_counts_compared tcc set num_common_tokens_b=(
+ select sum(num_common_tokens) as cnt from profiles_b pb
+ join profiles_a pa on pa.id=pb.id
+ join contents_b c on c.id = pb.id
+ where pb.mime_type_id= tcc.mime_type_id_b
+ and pa.mime_type_id=tcc.mime_type_id_a
+ group by mime_type_id_a, mime_type_id_b
+ );
</sql>
</before>
@@ -224,6 +345,8 @@
</sql>
</report>
+
+ <!-- Exceptions -->
<report reportName="AllExceptionsByMimeA"
reportFilename="exceptions/exceptions_by_mime_A.xlsx"
format="xlsx"
@@ -600,6 +723,8 @@
ca.num_common_tokens as NUM_COMMON_TOKENS_A,
cb.common_tokens_lang as COMMON_TOKENS_LANG_B,
cb.num_common_tokens as NUM_COMMON_TOKENS_B,
+ ifnull(cb.num_common_tokens,0)-
+ ifnull(ca.num_common_tokens, 0) as NUM_COMMON_TOKENS_DIFF_IN_B,
ca.top_n_tokens as TOP_N_TOKENS_A,
cb.top_n_tokens as TOP_N_TOKENS_B,
ca.unicode_char_blocks as UNICODE_CHAR_BLOCKS_A,
@@ -643,6 +768,8 @@
ca.num_common_tokens as NUM_COMMON_TOKENS_A,
cb.common_tokens_lang as COMMON_TOKENS_LANG_B,
cb.num_common_tokens as NUM_COMMON_TOKENS_B,
+ ifnull(cb.num_common_tokens,0)-
+ ifnull(ca.num_common_tokens, 0) as NUM_COMMON_TOKENS_DIFF_IN_B,
ca.top_n_tokens as TOP_N_TOKENS_A,
cb.top_n_tokens as TOP_N_TOKENS_B,
ca.unicode_char_blocks as UNICODE_CHAR_BLOCKS_A,
@@ -667,6 +794,41 @@
limit 100000
</sql>
</report>
+
+ <report reportName="CommonTokenComparisonsByMimeType"
+ reportFilename="content/common_token_comparisons_by_mime.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select ma.mime_string, mb.mime_string,
+ num_tokens_a, num_tokens_b
+ num_alphabetic_tokens_a, num_alphabetic_tokens_b,
+ num_common_tokens_a, num_common_tokens_b,
+ ifnull(num_common_tokens_b, 0)-ifnull(num_common_tokens_a, 0) as
change_in_common_tokens_b
+ from token_counts_compared tcc
+ join mimes ma on tcc.mime_type_id_a = ma.mime_type_id
+ join mimes mb on tcc.mime_type_id_b = mb.mime_type_id
+ order by change_in_common_tokens_b desc
+ </sql>
+ </report>
+
+
+ <report reportName="ExceptionComparisonsByMimeType"
+ reportFilename="exceptions/exceptions_compared_by_mime_type.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select ma.mime_string, mb.mime_string, exceptions_a,
+ total_a, percent_exceptions_a,
+ exceptions_b, total_b, percent_exceptions_b
+ from exceptions_compared c
+ join mimes ma on ma.mime_type_id=c.mime_type_id_a
+ join mimes mb on mb.mime_type_id=c.mime_type_id_b
+ order by percent_exceptions_b desc, total_b desc;
+ </sql>
+ </report>
<!-- <report reportName="MD5 Duplicate Counts A"
reportFilename="md5/md5_duplicate_counts_A.xlsx"
format="xlsx"
--
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].