All downloads are free. The search and download features use the official Maven repository.

ka.tika-eval.1.18.source-code.comparison-reports.xml Maven / Gradle / Ivy

<?xml version="1.0" encoding="UTF-8" standalone="no" ?>

<!--
  Licensed to the Apache Software Foundation (ASF) under one
  or more contributor license agreements.  See the NOTICE file
  distributed with this work for additional information
  regarding copyright ownership.  The ASF licenses this file
  to you under the Apache License, Version 2.0 (the
  "License"); you may not use this file except in compliance
  with the License.  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing,
  software distributed under the License is distributed on an
  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  KIND, either express or implied.  See the License for the
  specific language governing permissions and limitations
  under the License.
-->

<reports>


    <before>

        <!--
          Rebuilds two scratch tables, one per run, holding every non-null md5 value
          that appears on more than one profile row together with its occurrence count.
        -->
        <sql>drop table if exists md5_multiples_tmp_a</sql>
        <sql>
            create table md5_multiples_tmp_a (MD5 char(32), cnt int) as
                select md5, count(*) as cnt
                from profiles_a
                where md5 is not null
                group by md5
                having cnt &gt; 1
                order by cnt desc
        </sql>

        <sql>drop table if exists md5_multiples_tmp_b</sql>
        <sql>
            create table md5_multiples_tmp_b (MD5 char(32), cnt int) as
                select md5, count(*) as cnt
                from profiles_b
                where md5 is not null
                group by md5
                having cnt &gt; 1
                order by cnt desc
        </sql>
        <!-- build mime indexes -->

        <!-- Creates an index on mime_id for each profile table; skipped when the index is already there. -->
        <sql>
            create index if not exists pa_m_idx on profiles_a (mime_id);
        </sql>

        <sql>
            create index if not exists pb_m_idx on profiles_b (mime_id);
        </sql>

        <!-- build exceptions comparison table -->
        <!--
          Builds exceptions_compared: one row per (mime id in A, mime id in B) pair that
          occurs on a profile id present in both runs. For each pair it stores the number
          of matching files, the number of parse_exception_id=0 exceptions in each run,
          and exceptions/total as a fraction (left at the default 0.0 when the total is 0).
          NOTE(review): what parse_exception_id 0 denotes is defined in a reference table
          that is not visible here.
        -->
        <sql>drop table if exists exceptions_compared</sql>
        <sql>
            create table exceptions_compared
            (mime_id_a integer, mime_id_b integer,
            exceptions_a integer default 0,
            total_a integer default 0,
            percent_exceptions_a double default 0.0,
            exceptions_b integer default 0,
            total_b integer default 0,
            percent_exceptions_b double default 0.0);
        </sql>
        <sql>
            insert into exceptions_compared (mime_id_a, mime_id_b)
            select ma.mime_id, mb.mime_id
            from profiles_a a
            join profiles_b b on a.id=b.id
            join mimes ma on ma.mime_id=a.mime_id
            join mimes mb on mb.mime_id=b.mime_id
            group by ma.mime_id, mb.mime_id
        </sql>

        <!--
          The correlated counts below intentionally have no GROUP BY. The WHERE clause
          already pins both mime ids, and an ungrouped count() returns 0 when nothing
          matches; a grouped count() would return no row at all, which would store NULL
          over the column default of 0 and make the ratio NULL as well.
        -->
        <sql>
            update exceptions_compared ec set total_a=(
            select count(1) as cnt from profiles_a pa
            join profiles_b pb on pa.id=pb.id
            where pa.mime_id= ec.mime_id_a
            and pb.mime_id=ec.mime_id_b);
        </sql>
        <sql>
            update exceptions_compared ec set total_b=(
            select count(1) as cnt from profiles_b pb
            join profiles_a pa on pa.id=pb.id
            where pa.mime_id= ec.mime_id_a
            and pb.mime_id=ec.mime_id_b);
        </sql>
        <sql>
            update exceptions_compared ec set exceptions_a=
            ( select count(1) as cnt from exceptions_a ea
            join profiles_a pa on pa.id=ea.id
            join profiles_b pb on pb.id=pa.id
            where pa.mime_id= ec.mime_id_a
            and pb.mime_id=ec.mime_id_b
            and ea.parse_exception_id=0);
        </sql>
        <sql>
            update exceptions_compared ec set exceptions_b=
            ( select count(1) as cnt from exceptions_b eb
            join profiles_b pb on pb.id=eb.id
            join profiles_a pa on pa.id=pb.id
            where pa.mime_id= ec.mime_id_a
            and pb.mime_id=ec.mime_id_b
            and eb.parse_exception_id=0);
        </sql>

        <!-- Ratios are only computed where the divisor is positive, so no division by zero can occur. -->
        <sql>
            update exceptions_compared
            set percent_exceptions_a =
            (cast (exceptions_a as decimal))/(cast (total_a as decimal))
            where total_a &gt; 0
        </sql>
        <sql>
            update exceptions_compared
            set percent_exceptions_b =
            (cast (exceptions_b as decimal))/(cast (total_b as decimal))
            where total_b &gt; 0
        </sql>

        <!-- build tmp common words table -->
        <!--
          Builds token_counts_compared: one row per (mime id in A, mime id in B) pair
          that occurs on a profile id present in both runs, with the summed token,
          alphabetic-token and common-token counts taken from contents_a and contents_b.
        -->
        <sql>drop table if exists token_counts_compared</sql>
        <sql>
            create table token_counts_compared
            (mime_id_a integer,
            mime_id_b integer,
            num_tokens_a long default 0,
            num_tokens_b long default 0,
            num_alphabetic_tokens_a long default 0,
            num_alphabetic_tokens_b long default 0,
            num_common_tokens_a long default 0,
            num_common_tokens_b long default 0
            );
        </sql>
        <sql>
            insert into token_counts_compared (mime_id_a, mime_id_b)
            select ma.mime_id, mb.mime_id
            from profiles_a a
            join profiles_b b on a.id=b.id
            join mimes ma on ma.mime_id=a.mime_id
            join mimes mb on mb.mime_id=b.mime_id
            group by ma.mime_id, mb.mime_id

        </sql>

        <!--
          Each sum is wrapped in coalesce(..., 0): sum() is NULL when no content row
          matches the mime pair, and storing that NULL would replace the column default
          of 0. No GROUP BY is needed because the WHERE clause already pins both mime ids.
        -->
        <sql>
            update token_counts_compared tcc set num_tokens_a=(
            select coalesce(sum(num_tokens), 0) as cnt from profiles_a pa
            join profiles_b pb on pa.id=pb.id
            join contents_a c on c.id = pa.id
            where pb.mime_id= tcc.mime_id_b
            and pa.mime_id=tcc.mime_id_a
            );
        </sql>

        <sql>
            update token_counts_compared tcc set num_tokens_b=(
            select coalesce(sum(num_tokens), 0) as cnt from profiles_b pb
            join profiles_a pa on pa.id=pb.id
            join contents_b c on c.id = pb.id
            where pb.mime_id= tcc.mime_id_b
            and pa.mime_id=tcc.mime_id_a
            );
        </sql>

        <sql>
            update token_counts_compared tcc set num_alphabetic_tokens_a=(
            select coalesce(sum(num_alphabetic_tokens), 0) as cnt from profiles_a pa
            join profiles_b pb on pa.id=pb.id
            join contents_a c on c.id = pa.id
            where pb.mime_id= tcc.mime_id_b
            and pa.mime_id=tcc.mime_id_a
            );
        </sql>

        <sql>
            update token_counts_compared tcc set num_alphabetic_tokens_b=(
            select coalesce(sum(num_alphabetic_tokens), 0) as cnt from profiles_b pb
            join profiles_a pa on pb.id=pa.id
            join contents_b c on c.id = pb.id
            where pb.mime_id= tcc.mime_id_b
            and pa.mime_id=tcc.mime_id_a
            );
        </sql>

        <sql>
            update token_counts_compared tcc set num_common_tokens_a=(
            select coalesce(sum(num_common_tokens), 0) as cnt from profiles_a pa
            join profiles_b pb on pa.id=pb.id
            join contents_a c on c.id = pa.id
            where pb.mime_id= tcc.mime_id_b
            and pa.mime_id=tcc.mime_id_a
            );
        </sql>

        <sql>
            update token_counts_compared tcc set num_common_tokens_b=(
            select coalesce(sum(num_common_tokens), 0) as cnt from profiles_b pb
            join profiles_a pa on pa.id=pb.id
            join contents_b c on c.id = pb.id
            where pb.mime_id= tcc.mime_id_b
            and pa.mime_id=tcc.mime_id_a
            );
        </sql>

    </before>

    <!-- MIMES -->
    <!-- Counts profiles_a rows per mime_string, largest count first. -->
    <report reportName="All Mimes In A"
            reportFilename="mimes/all_mimes_A.xlsx"
            format="xlsx"
            includeSql="true">

        <sql>
            select mime_string, count(1) cnt from
            profiles_a p
            join mimes m on m.mime_id = p.mime_id
            group by mime_string
            order by cnt desc
        </sql>
    </report>

    <!-- Counts profiles_b rows per mime_string, largest count first. -->
    <report reportName="All Mimes In B"
            reportFilename="mimes/all_mimes_B.xlsx"
            format="xlsx"
            includeSql="true">

        <sql>
            select mime_string, count(1) cnt from
            profiles_b p
            join mimes m on m.mime_id = p.mime_id
            group by mime_string
            order by cnt desc
        </sql>
    </report>
    <!-- Counts profiles_a rows per mime_string, keeping only rows with is_embedded=false; largest count first. -->
    <report reportName="Container Mimes In A"
            reportFilename="mimes/container_mimes_A.xlsx"
            format="xlsx"
            includeSql="true">

        <sql>
            select mime_string, count(1) cnt from
            profiles_a p
            join mimes m on m.mime_id = p.mime_id
            where is_embedded=false
            group by mime_string
            order by cnt desc
        </sql>
    </report>

    <!-- Counts profiles_b rows per mime_string, keeping only rows with is_embedded=false; largest count first. -->
    <report reportName="Container Mimes In B"
            reportFilename="mimes/container_mimes_B.xlsx"
            format="xlsx"
            includeSql="true">

        <sql>
            select mime_string, count(1) cnt from
            profiles_b p
            join mimes m on m.mime_id = p.mime_id
            where is_embedded=false
            group by mime_string
            order by cnt desc
        </sql>
    </report>
    <!-- Counts profiles_a rows per mime_string, keeping only rows with is_embedded=true; largest count first. -->
    <report reportName="Embedded Mimes In A"
            reportFilename="mimes/embedded_mimes_A.xlsx"
            format="xlsx"
            includeSql="true">

        <sql>
            select mime_string, count(1) cnt from
            profiles_a p
            join mimes m on m.mime_id = p.mime_id
            where is_embedded=true
            group by mime_string
            order by cnt desc
        </sql>
    </report>

    <!-- Counts profiles_b rows per mime_string, keeping only rows with is_embedded=true; largest count first. -->
    <report reportName="Embedded Mimes In B"
            reportFilename="mimes/embedded_mimes_B.xlsx"
            format="xlsx"
            includeSql="true">

        <sql>
            select mime_string, count(1) cnt from
            profiles_b p
            join mimes m on m.mime_id = p.mime_id
            where is_embedded=true
            group by mime_string
            order by cnt desc
        </sql>
    </report>
    <!--
      For profile ids present in both runs whose mime_id differs, counts each
      "mime in A -> mime in B" label; most frequent label first.
    -->
    <report reportName="Mime Differences A -> B"
            reportFilename="mimes/mime_diffs_A_to_B.xlsx"
            format="xlsx"
            includeSql="true">

        <sql>
            select concat(ma.mime_string, ' -&gt; ', mb.mime_string) as
            MIME_A_TO_MIME_B, count(1) as COUNT
            from profiles_a a
            join profiles_b b on a.id=b.id
            join mimes ma on ma.mime_id=a.mime_id
            join mimes mb on mb.mime_id=b.mime_id
            where a.mime_id &lt;&gt; b.mime_id
            group by MIME_A_TO_MIME_B
            order by COUNT DESC
        </sql>
    </report>

    <!--
      Lists one row per profile id whose mime_id differs between the runs, showing the
      "mime in A -> mime in B" label, the file path, the container length and the file
      name from run A; ordered by the label. Only ids with a matching containers row
      (via run A's container_id) are listed.
    -->
    <report reportName="Mime Differences A -> B Details"
            reportFilename="mimes/mime_diffs_A_to_B_details.xlsx"
            format="xlsx"
            includeSql="true">

        <sql>
            select concat(ma.mime_string, ' -&gt; ', mb.mime_string) as
            MIME_A_TO_MIME_B,
            file_path,
            c.length as CONTAINER_LENGTH,
            a.file_name
            from profiles_a a
            join profiles_b b on a.id=b.id
            join mimes ma on ma.mime_id=a.mime_id
            join mimes mb on mb.mime_id=b.mime_id
            join containers c on a.container_id=c.container_id
            where a.mime_id &lt;&gt; b.mime_id
            order by MIME_A_TO_MIME_B
        </sql>
    </report>


    <!-- Exceptions -->
    <!--
      Counts exceptions_a rows per mime_string of the matching profiles_a row, largest
      count first. No filter on parse_exception_id is applied.
    -->
    <report reportName="AllExceptionsByMimeA"
            reportFilename="exceptions/exceptions_by_mime_A.xlsx"
            format="xlsx"
            includeSql="true">

        <sql>
            select mime_string, count(1) cnt from
            exceptions_a e
            join profiles_a p on p.id=e.id
            join mimes m on m.mime_id = p.mime_id
            group by mime_string
            order by cnt desc
        </sql>
    </report>
    <!--
      Counts exceptions_b rows per mime_string of the matching profiles_b row, largest
      count first. No filter on parse_exception_id is applied.
    -->
    <report reportName="AllExceptionsByMimeB"
            reportFilename="exceptions/exceptions_by_mime_B.xlsx"
            format="xlsx"
            includeSql="true">

        <sql>
            select mime_string, count(1) cnt from
            exceptions_b e
            join profiles_b p on p.id=e.id
            join mimes m on m.mime_id = p.mime_id
            group by mime_string
            order by cnt desc
        </sql>
    </report>

    <!--
      Counts exceptions_a rows per mime_string, restricted to profiles with
      is_embedded=false and exceptions with parse_exception_id=0; largest count first.
      NOTE(review): what parse_exception_id 0 denotes is defined in a reference table
      that is not visible here.
    -->
    <report reportName="ContainerExceptionsByMimeA"
            reportFilename="exceptions/container_exceptions_by_mime_A.xlsx"
            format="xlsx"
            includeSql="true">

        <sql>
            select mime_string, count(1) cnt from
            exceptions_a e
            join profiles_a p on p.id=e.id
            join mimes m on m.mime_id = p.mime_id
            where is_embedded=false
            and parse_exception_id=0
            group by mime_string
            order by cnt desc
        </sql>
    </report>

    <!--
      Counts exceptions_b rows per mime_string, restricted to profiles with
      is_embedded=false and exceptions with parse_exception_id=0; largest count first.
      NOTE(review): what parse_exception_id 0 denotes is defined in a reference table
      that is not visible here.
    -->
    <report reportName="ContainerExceptionsByMimeB"
            reportFilename="exceptions/container_exceptions_by_mime_B.xlsx"
            format="xlsx"
            includeSql="true">

        <sql>
            select mime_string, count(1) cnt from
            exceptions_b e
            join profiles_b p on p.id=e.id
            join mimes m on m.mime_id = p.mime_id
            where is_embedded=false
            and parse_exception_id=0
            group by mime_string
            order by cnt desc
        </sql>
    </report>
    <!--
      Counts run A exceptions per mime type and parse-exception description, ordered by
      mime type and then exception type. mime_string is listed in the GROUP BY next to
      p.mime_id so that every selected non-aggregate column is a grouping column.
    -->
    <report reportName="AllExceptionsByMimeByTypeA"
            reportFilename="exceptions/exceptions_by_mime_by_type_A.xlsx"
            format="xlsx"
            includeSql="true">

        <sql>
            select mime_string as MIME_TYPE,
            parse_exception_description as EXCEPTION_TYPE, count(1) as COUNT
            from exceptions_a e
            join profiles_a p on p.id=e.id
            join containers c on p.container_id=c.container_id
            join mimes m on m.mime_id=p.mime_id
            join ref_parse_exception_types r on
            r.parse_exception_id=e.parse_exception_id
            group by p.mime_id, mime_string, parse_exception_description
            order by MIME_TYPE, EXCEPTION_TYPE
        </sql>
    </report>

    <!--
      Counts run B exceptions per mime type and parse-exception description, ordered by
      mime type and then exception type. mime_string is listed in the GROUP BY next to
      p.mime_id so that every selected non-aggregate column is a grouping column.
    -->
    <report reportName="AllExceptionsByMimeByTypeB"
            reportFilename="exceptions/exceptions_by_mime_by_type_B.xlsx"
            format="xlsx"
            includeSql="true">

        <sql>
            select mime_string as MIME_TYPE,
            parse_exception_description as EXCEPTION_TYPE, count(1) as COUNT
            from exceptions_b e
            join profiles_b p on p.id=e.id
            join containers c on p.container_id=c.container_id
            join mimes m on m.mime_id=p.mime_id
            join ref_parse_exception_types r on
            r.parse_exception_id=e.parse_exception_id
            group by p.mime_id, mime_string, parse_exception_description
            order by MIME_TYPE, EXCEPTION_TYPE
        </sql>
    </report>

    <!--
      For ids that have a contents_a row, lists those with a stack trace in exceptions_b
      and none in exceptions_a, together with token statistics from both runs and the
      run B stack trace. Rows are ordered by how many more common tokens run A had than
      run B, treating a missing run B count as 0.
    -->
    <report reportName="TextLostFromACausedByNewExceptionsInB"
            reportFilename="exceptions/text_lost_from_A_caused_by_new_exceptions_in_B.xlsx"
            format="xlsx"
            includeSql="true">

        <sql>
            select file_path as FILE_PATH,
            c.length as CONTAINER_LENGTH,
            ca.NUM_TOKENS as NUM_TOKENS_A,
            cb.NUM_TOKENS as NUM_TOKENS_B,
            ca.NUM_UNIQUE_TOKENS as NUM_UNIQUE_TOKENS_A, cb.NUM_UNIQUE_TOKENS
            as NUM_UNIQUE_TOKENS_B,
            ca.common_tokens_lang as COMMON_TOKENS_LANG_A,
            ca.num_common_tokens as NUM_COMMON_TOKENS_A,
            cb.common_tokens_lang as COMMON_TOKENS_LANG_B,
            cb.num_common_tokens as NUM_COMMON_TOKENS_B,
            ca.top_n_tokens as TOP_N_TOKENS_A, cb.top_n_tokens as TOP_N_TOKENS_B,
            eb.ORIG_STACK_TRACE as ORIG_STACK_TRACE_B
            from contents_a ca
            join profiles_a pa on ca.id = pa.id
            join containers c on pa.container_id=c.container_id
            left join contents_b cb on ca.id=cb.id
            left join exceptions_b eb on ca.id = eb.id
            left join exceptions_a ea on ca.id = ea.id
            where eb.orig_stack_trace is not null
            and ea.orig_stack_trace is null
            order by ca.num_common_tokens - ifnull(cb.num_common_tokens,0) desc
        </sql>
    </report>

    <!--
      Counts, per mime type in run A, the exceptions_a rows with parse_exception_id=0
      that have no exceptions_b row for the same id. The inner join to profiles_b limits
      the count to ids profiled in both runs. Rows are ordered by count, largest first,
      so the output order is deterministic.
    -->
    <report reportName="FixedExceptionsInBByMimeType"
            reportFilename="exceptions/fixed_exceptions_in_B_by_mime.xlsx"
            format="xlsx"
            includeSql="true">

        <sql>
            select mime_string as MIME_TYPE, count(1) as COUNT
            from exceptions_a ea
            left join exceptions_b eb on ea.id = eb.id
            join profiles_a pa on pa.id=ea.id
            join profiles_b pb on pa.id=pb.id
            join containers c on pa.container_id=c.container_id
            join mimes m on m.mime_id=pa.mime_id
            where eb.id is null
            and ea.parse_exception_id=0
            group by mime_string
            order by COUNT desc
        </sql>
    </report>

    <!--
      Lists each file whose parse_exception_id=0 exception in run A has no exceptions_b
      row for the same id, with path, container length, mime type, file name and the
      is_embedded flag; ordered by mime type. The note inside the query uses the
      standard SQL line-comment marker rather than a double slash.
    -->
    <report reportName="FixedExceptionsInByDetails"
            reportFilename="exceptions/fixed_exceptions_in_B_details.xlsx"
            format="xlsx"
            includeSql="true">
        <sql>
            select
            file_path,
            c.length as CONTAINER_LENGTH,
            mime_string as MIME_TYPE,
            pa.file_name, pa.is_embedded
            from exceptions_a ea
            left join exceptions_b eb on ea.id = eb.id
            join profiles_a pa on pa.id=ea.id
            join profiles_b pb on pb.id=pa.id -- this ensures that files were actually processed in both runs
            join containers c on pa.container_id=c.container_id
            join mimes m on m.mime_id=pa.mime_id
            where eb.id is null
            and ea.parse_exception_id=0
            order by mime_string
        </sql>
    </report>
    <!--
      For exceptions_a rows with parse_exception_id=0 that have no exceptions_b row for
      the same id, shows path, container length, mime type and content statistics.
      The inner join to contents_b means only ids with a contents_b row are listed.
      NOTE(review): the statistic columns are unqualified; presumably they come from
      contents_b - confirm against the schema.
    -->
    <report reportName="ContentsOfFixedExceptionsInB"
            reportFilename="exceptions/contents_of_fixed_exceptions_in_B.xlsx"
            format="xlsx"
            includeSql="true">

        <sql>
            select file_path,
            c.length as CONTAINER_LENGTH,
            mime_string as MIME_TYPE,
            CONTENT_LENGTH,
            NUM_TOKENS, NUM_UNIQUE_TOKENS,
            TOP_N_TOKENS, LANG_ID_1,TOKEN_LENGTH_MEAN, TOKEN_LENGTH_STD_DEV
            from exceptions_a ea
            left join exceptions_b eb on ea.id = eb.id
            join profiles_a p on p.id=ea.id
            join contents_b cb on cb.id=ea.id
            join containers c on p.container_id=c.container_id
            join mimes m on m.mime_id=p.mime_id
            where eb.id is null
            and ea.parse_exception_id=0
        </sql>
    </report>

    <!--
      Counts, per mime type recorded in run A, the exceptions_b rows with
      parse_exception_id=0 that have no exceptions_a row for the same id. The inner
      joins to profiles_a and profiles_b limit the count to ids profiled in both runs.
      Largest count first.
    -->
    <report reportName="NewExceptionsByMimeType"
            reportFilename="exceptions/new_exceptions_in_B_by_mime.xlsx"
            format="xlsx"
            includeSql="true">

        <sql>
            select mime_string as MIME_TYPE_A, count(1) as COUNT
            from exceptions_b eb
            left join exceptions_a ea on ea.id = eb.id
            join profiles_a pa on pa.id=eb.id
            join profiles_b pb on pb.id=pa.id
            join containers c on pa.container_id=c.container_id
            join mimes m on m.mime_id=pa.mime_id
            where ea.id is null
            and eb.parse_exception_id=0
            group by mime_string
            order by COUNT desc
        </sql>
    </report>

    <!--
      Counts exceptions_b rows with parse_exception_id=0 and no exceptions_a row for the
      same id, grouped by run A mime type and sort_stack_trace; ordered by mime type and
      then by count, largest first.
    -->
    <report reportName="NewExceptionsInBByMimeTypeByStackTrace"
            reportFilename="exceptions/new_exceptions_in_B_by_mime_by_stack_trace.xlsx"
            format="xlsx"
            includeSql="true">

        <sql>
            select MIME_STRING as MIME_TYPE, eb.sort_stack_trace, count(1) as
            COUNT
            from exceptions_b eb
            left join exceptions_a ea on ea.id = eb.id
            join profiles_a p on p.id=eb.id
            join mimes m on m.mime_id=p.mime_id
            where ea.id is null
            and eb.parse_exception_id=0
            group by MIME_TYPE, eb.sort_stack_trace
            order by MIME_TYPE asc, COUNT desc
        </sql>
    </report>

    <!--
      Lists each exceptions_b row with parse_exception_id=0 and no exceptions_a row for
      the same id, with file path, container length, run A mime type and both stack
      trace columns; ordered by mime type and original stack trace.
    -->
    <report reportName="NewExceptionsInBDetails"
            reportFilename="exceptions/new_exceptions_in_B_details.xlsx"
            format="xlsx"
            includeSql="true">

        <sql>
            select file_path,
            c.length as CONTAINER_LENGTH,
            mime_string as MIME_TYPE,
            eb.orig_stack_trace, eb.sort_stack_trace
            from exceptions_b eb
            left join exceptions_a ea on ea.id = eb.id
            join profiles_a p on p.id=eb.id
            join containers c on p.container_id=c.container_id
            join mimes m on m.mime_id=p.mime_id
            where ea.id is null
            and eb.parse_exception_id=0
            order by MIME_TYPE asc, eb.ORIG_STACK_TRACE
        </sql>
    </report>

    <!--
      Counts run A exceptions with parse_exception_id=0 per mime type and
      sort_stack_trace; ordered by mime type and then by count, largest first.
      The parse_exception_id filter sits in the WHERE clause because it restricts rows
      rather than relating the joined tables; the result is the same for inner joins.
    -->
    <report reportName="StackTracesByMimeInA"
            reportFilename="exceptions/stack_traces_by_mime_A.xlsx"
            format="xlsx"
            includeSql="true">

        <sql>
            select MIME_STRING as MIME_TYPE, e.sort_stack_trace, count(1) as
            COUNT
            from exceptions_a e
            join profiles_a p on p.id=e.id
            join mimes m on m.mime_id=p.mime_id
            where e.parse_exception_id=0
            group by MIME_TYPE, e.sort_stack_trace
            order by MIME_TYPE asc, COUNT desc
        </sql>
    </report>

    <!--
      Lists every run A exception with parse_exception_id=0, with file path, container
      length, mime type and both stack trace columns; ordered by mime type, stack
      traces and container length. The parse_exception_id filter sits in the WHERE
      clause because it restricts rows rather than relating the joined tables; the
      result is the same for inner joins.
    -->
    <report reportName="AllStackTracesInA"
            reportFilename="exceptions/stack_traces_A.xlsx"
            format="xlsx"
            includeSql="true">

        <sql>
            select file_path,
            c.length as CONTAINER_LENGTH,
            mime_string as MIME_TYPE,
            orig_stack_trace, sort_stack_trace
            from exceptions_a e
            join profiles_a p on p.id=e.id
            join containers c on p.container_id=c.container_id
            join mimes m on m.mime_id=p.mime_id
            where e.parse_exception_id=0
            order by MIME_TYPE asc, sort_stack_trace, orig_stack_trace,
            CONTAINER_LENGTH asc
        </sql>
    </report>
    <!--
      Lists every run B exception with parse_exception_id=0, with file path, container
      length, mime type and both stack trace columns; ordered by mime type, stack
      traces and container length. The parse_exception_id filter sits in the WHERE
      clause because it restricts rows rather than relating the joined tables; the
      result is the same for inner joins.
    -->
    <report reportName="AllStackTracesInB"
            reportFilename="exceptions/stack_traces_B.xlsx"
            format="xlsx"
            includeSql="true">

        <sql>
            select file_path,
            c.length as CONTAINER_LENGTH,
            mime_string as MIME_TYPE,
            orig_stack_trace, sort_stack_trace
            from exceptions_b e
            join profiles_b p on p.id=e.id
            join containers c on p.container_id=c.container_id
            join mimes m on m.mime_id=p.mime_id
            where e.parse_exception_id=0
            order by MIME_TYPE asc, sort_stack_trace, orig_stack_trace,
            CONTAINER_LENGTH asc
        </sql>
    </report>

    <!--
      Counts run B exceptions with parse_exception_id=0 per mime type and
      sort_stack_trace; ordered by mime type and then by count, largest first.
      The parse_exception_id filter sits in the WHERE clause because it restricts rows
      rather than relating the joined tables; the result is the same for inner joins.
    -->
    <report reportName="StackTracesByMimeInB"
            reportFilename="exceptions/stack_traces_by_mime_B.xlsx"
            format="xlsx"
            includeSql="true">

        <sql>
            select MIME_STRING as MIME_TYPE, e.sort_stack_trace, count(1) as
            COUNT
            from exceptions_b e
            join profiles_b p on p.id=e.id
            join mimes m on m.mime_id=p.mime_id
            where e.parse_exception_id=0
            group by MIME_TYPE, e.sort_stack_trace
            order by MIME_TYPE asc, COUNT desc
        </sql>
    </report>
    <!-- Lists the file_path of each extract_exceptions_a row with its description from ref_extract_exception_types. -->
    <report reportName="extractExceptionsA"
            reportFilename="exceptions/extract_exceptions_a.xlsx"
            format="xlsx"
            includeSql="true">
        <sql>
            select file_path, extract_exception_description
            from extract_exceptions_a e
            join ref_extract_exception_types t
            on e.extract_exception_id=t.extract_exception_id
        </sql>
    </report>
    <!-- Lists the file_path of each extract_exceptions_b row with its description from ref_extract_exception_types. -->
    <report reportName="extractExceptionsB"
            reportFilename="exceptions/extract_exceptions_b.xlsx"
            format="xlsx"
            includeSql="true">
        <sql>
            select file_path, extract_exception_description
            from extract_exceptions_b e
            join ref_extract_exception_types t
            on e.extract_exception_id=t.extract_exception_id
        </sql>
    </report>
    <!--
      Counts exceptions_a rows per parse exception type. The description is listed in
      the GROUP BY next to the id so that every selected non-aggregate column is a
      grouping column.
    -->
    <report reportName="parseExceptionTypesA"
            reportFilename="exceptions/overall_exception_types_a.xlsx"
            format="xlsx"
            includeSql="true">
        <sql>
            select parse_exception_description, count(1)
            from exceptions_a e
            join ref_parse_exception_types t on
            t.parse_exception_id=e.parse_exception_id
            group by e.parse_exception_id, parse_exception_description
        </sql>
    </report>
    <!--
      Counts exceptions_b rows per parse exception type. The description is listed in
      the GROUP BY next to the id so that every selected non-aggregate column is a
      grouping column.
    -->
    <report reportName="parseExceptionTypesB"
            reportFilename="exceptions/overall_exception_types_b.xlsx"
            format="xlsx"
            includeSql="true">
        <sql>
            select parse_exception_description, count(1)
            from exceptions_b e
            join ref_parse_exception_types t on
            t.parse_exception_id=e.parse_exception_id
            group by e.parse_exception_id, parse_exception_description
        </sql>
    </report>

    <!--
      Per-file content comparison between the runs for ids where overlap is below 0.95
      or the token counts differ by more than 30. contents_b is left-joined. Rows are
      excluded when either run recorded parse_exception_id=2; other exception ids are
      kept. Ordered by run A mime type and ascending overlap, capped at 100000 rows.
      NOTE(review): what parse_exception_id 2 denotes is defined in a reference table
      that is not visible here.
    -->
    <report reportName="contentDiffsWExceptions"
            reportFilename="content/content_diffs_with_exceptions.xlsx"
            format="xlsx"
            includeSql="true">

        <sql>
            select file_path,
            c.length as CONTAINER_LENGTH,
            ma.mime_string as MIME_STRING_A,
            mb.mime_string as MIME_STRING_B,
            ca.num_unique_tokens as NUM_UNIQUE_TOKENS_A,
            cb.num_unique_tokens as NUM_UNIQUE_TOKENS_B,
            ca.num_tokens as NUM_TOKENS_A,
            cb.num_tokens as NUM_TOKENS_B,
            ca.common_tokens_lang as COMMON_TOKENS_LANG_A,
            ca.num_common_tokens as NUM_COMMON_TOKENS_A,
            cb.common_tokens_lang as COMMON_TOKENS_LANG_B,
            cb.num_common_tokens as NUM_COMMON_TOKENS_B,
            ifnull(cb.num_common_tokens,0)-
            ifnull(ca.num_common_tokens, 0) as NUM_COMMON_TOKENS_DIFF_IN_B,
            ca.top_n_tokens as TOP_N_TOKENS_A,
            cb.top_n_tokens as TOP_N_TOKENS_B,
            ca.unicode_char_blocks as UNICODE_CHAR_BLOCKS_A,
            cb.unicode_char_blocks as UNICODE_CHAR_BLOCKS_B,
            top_10_unique_token_diffs_a,
            top_10_unique_token_diffs_b,
            top_10_more_in_a, top_10_more_in_b, dice_coefficient, overlap
            from content_comparisons cc
            join contents_a ca on ca.id=cc.id
            left join contents_b cb on cb.id=cc.id
            join profiles_a pa on pa.id = cc.id
            join profiles_b pb on pb.id=cc.id
            join containers c on c.container_id=pa.container_id
            join mimes ma on ma.mime_id=pa.mime_id
            join mimes mb on mb.mime_id=pb.mime_id
            left join exceptions_a ea on ea.id=cc.id
            left join exceptions_b eb on eb.id=cc.id
            where (overlap &lt; 0.95 or abs(ca.NUM_TOKENS-cb.NUM_TOKENS) &gt;30)
            and (ea.parse_exception_id is null or
            ea.parse_exception_id &lt;&gt; 2)
            and (eb.parse_exception_id is null or
            eb.parse_exception_id &lt;&gt; 2)
            order by ma.mime_string, overlap asc
            limit 100000
        </sql>
    </report>
    <!--
      Per-file content comparison between the runs for ids where overlap is below 0.95
      or the token counts differ by more than 30, keeping only ids for which neither
      exceptions_a nor exceptions_b supplies a parse_exception_id. Both content tables
      are inner-joined. Ordered by run A mime type and ascending overlap, capped at
      100000 rows.
    -->
    <report reportName="contentDiffsIgnoreExceptions"
            reportFilename="content/content_diffs_ignore_exceptions.xlsx"
            format="xlsx"
            includeSql="true">

        <sql>
            select file_path,
            c.length as CONTAINER_LENGTH,
            ma.mime_string as MIME_STRING_A,
            mb.mime_string as MIME_STRING_B,
            ca.NUM_UNIQUE_TOKENS as NUM_UNIQUE_TOKENS_A,
            cb.NUM_UNIQUE_TOKENS as NUM_UNIQUE_TOKENS_B,
            ca.NUM_TOKENS as NUM_TOKENS_A,
            cb.NUM_TOKENS as NUM_TOKENS_B,
            ca.common_tokens_lang as COMMON_TOKENS_LANG_A,
            ca.num_common_tokens as NUM_COMMON_TOKENS_A,
            cb.common_tokens_lang as COMMON_TOKENS_LANG_B,
            cb.num_common_tokens as NUM_COMMON_TOKENS_B,
            ifnull(cb.num_common_tokens,0)-
            ifnull(ca.num_common_tokens, 0) as NUM_COMMON_TOKENS_DIFF_IN_B,
            ca.top_n_tokens as TOP_N_TOKENS_A,
            cb.top_n_tokens as TOP_N_TOKENS_B,
            ca.unicode_char_blocks as UNICODE_CHAR_BLOCKS_A,
            cb.unicode_char_blocks as UNICODE_CHAR_BLOCKS_B,
            top_10_unique_token_diffs_a,
            top_10_unique_token_diffs_b,
            top_10_more_in_a, top_10_more_in_b, dice_coefficient, overlap
            from content_comparisons cc
            join contents_a ca on ca.id=cc.id
            join contents_b cb on cb.id=cc.id
            join profiles_a pa on pa.id = cc.id
            join profiles_b pb on pb.id=cc.id
            join containers c on c.container_id=pa.container_id
            join mimes ma on ma.mime_id=pa.mime_id
            join mimes mb on mb.mime_id=pb.mime_id
            left join exceptions_a ea on ea.id=cc.id
            left join exceptions_b eb on eb.id=cc.id
            where (overlap &lt; 0.95 or abs(ca.NUM_TOKENS-cb.NUM_TOKENS) &gt;30)
            and (ea.parse_exception_id is null)
            and (eb.parse_exception_id is null)
            order by ma.mime_string, overlap asc
            limit 100000
        </sql>
    </report>

    <!--
      Reads token_counts_compared and shows, per (mime in A, mime in B) pair, the stored
      token, alphabetic-token and common-token totals plus the change in common tokens
      from A to B (NULL counted as 0); largest increase first.
    -->
    <report reportName="CommonTokenComparisonsByMimeType"
            reportFilename="content/common_token_comparisons_by_mime.xlsx"
            format="xlsx"
            includeSql="true">

        <sql>
            select ma.mime_string as MIME_STRING_A, mb.mime_string as MIME_STRING_B,
            num_tokens_a, num_tokens_b,
            num_alphabetic_tokens_a, num_alphabetic_tokens_b,
            num_common_tokens_a, num_common_tokens_b,
            ifnull(num_common_tokens_b, 0)-ifnull(num_common_tokens_a, 0) as change_in_common_tokens_b
            from token_counts_compared tcc
            join mimes ma on tcc.mime_id_a = ma.mime_id
            join mimes mb on tcc.mime_id_b = mb.mime_id
            order by change_in_common_tokens_b desc
        </sql>
    </report>
    <!--
      Lists files whose num_pages is non-null in both runs and differs between them,
      with both page counts and the difference (B minus A); ordered by that difference,
      ascending.
    -->
    <report reportName="PageCountDiffs"
            reportFilename="content/page_count_diffs.xlsx"
            format="xlsx"
            includeSql="true">

        <sql>
            select file_path,
            c.length as CONTAINER_LENGTH,
            ma.mime_string as MIME_STRING_A,
            mb.mime_string as MIME_STRING_B,
            pa.num_pages as NUM_PAGES_A,
            pb.num_pages as NUM_PAGES_B,
            (pb.num_pages-pa.num_pages) as DIFF_NUM_PAGES_IN_B
            from profiles_a pa
            join profiles_b pb on pa.id = pb.id
            join containers c on pa.container_id=c.container_id
            join mimes ma on ma.mime_id=pa.mime_id
            join mimes mb on mb.mime_id=pb.mime_id
            where pa.num_pages is not null
            and pb.num_pages is not null
            and pa.num_pages &lt;&gt; pb.num_pages
            order by DIFF_NUM_PAGES_IN_B asc;
        </sql>
    </report>


    <!--
      Reads exceptions_compared and shows, per (mime in A, mime in B) pair, the
      exception count, file total and exception ratio for each run; ordered by the run
      B ratio and then the run B total, both descending. The two mime columns carry
      distinct labels so the output does not contain two columns with the same name.
    -->
    <report reportName="ExceptionComparisonsByMimeType"
            reportFilename="exceptions/exceptions_compared_by_mime_type.xlsx"
            format="xlsx"
            includeSql="true">

        <sql>
            select ma.mime_string as MIME_STRING_A,
            mb.mime_string as MIME_STRING_B,
            exceptions_a,
            total_a, percent_exceptions_a,
            exceptions_b, total_b, percent_exceptions_b
            from exceptions_compared c
            join mimes ma on ma.mime_id=c.mime_id_a
            join mimes mb on mb.mime_id=c.mime_id_b
            order by percent_exceptions_b desc, total_b desc;
        </sql>
    </report>
    <!--    <report reportName="MD5 Duplicate Counts A"
                reportFilename="md5/md5_duplicate_counts_A.xlsx"
                format="xlsx"
                            includeSql="true">
            <sql>
                select md5, count(1) cnt
                from profiles_a
                group by md5
                having cnt > 2
                order by cnt desc
            </sql>
        </report>

        <report reportName="MD5 Duplicate Counts B"
                reportFilename="md5/md5_duplicate_counts_B.xlsx"
                format="xlsx"
                            includeSql="true">

            <sql>
                select md5, count(1) cnt
                from profiles_b
                group by md5
                having cnt > 2
                order by cnt desc
            </sql>
        </report>

        <report reportName="MD5 Duplicates A"
                reportFilename="md5/md5_duplicates_A.xlsx"
                format="xlsx"
                            includeSql="true">

            <sql>
                select file_path, file_name, is_embedded, content_length, NUM_TOKENS, p.md5
                from md5_multiples_tmp_a t
                join profiles_a p on p.md5 = t.md5
                join containers c on p.container_id = c.container_id
                join contents_a cb on p.id=cb.id
                order by t.cnt desc
            </sql>
        </report>

        <report reportName="MD5 Duplicates B"
                reportFilename="md5/md5_duplicates_B.xlsx"
                format="xlsx"
                            includeSql="true">

            <sql>
                select file_path, file_name, is_embedded, content_length, NUM_TOKENS, p.md5
                from md5_multiples_tmp_b t
                join profiles_b p on p.md5 = t.md5
                join containers c on p.container_id = c.container_id
                join contents_b cb on p.id=cb.id
                order by t.cnt desc
            </sql>
        </report>
    -->

    <report reportName="Attachment Diffs"
            reportFilename="attachments/attachment_diffs.xlsx"
            format="xlsx"
            includeSql="true">

        <!-- Non-embedded rows (per profiles_a.is_embedded) whose attachment
             count differs between A and B, restricted to rows with no parse
             exception id on either side. Because of that restriction,
             EXCEPTION_ID_A and EXCEPTION_ID_B are always null in the output.
             Sorted by mime string in A, then by the change in attachment
             count (B minus A); capped at 1000 rows. -->
        <sql>
            select
                file_path,
                c.length as CONTAINER_LENGTH,
                ma.mime_string as MIME_STRING_A,
                mb.mime_string as MIME_STRING_B,
                pa.num_attachments as NUM_ATTACHMENTS_A,
                pb.num_attachments as NUM_ATTACHMENTS_B,
                ea.parse_exception_id as EXCEPTION_ID_A,
                eb.parse_exception_id as EXCEPTION_ID_B
            from profiles_a pa
            inner join profiles_b pb
                on pb.id = pa.id
            inner join containers c
                on c.container_id = pa.container_id
            inner join mimes ma
                on ma.mime_id = pa.mime_id
            inner join mimes mb
                on mb.mime_id = pb.mime_id
            left join exceptions_a ea
                on ea.id = pa.id
            left join exceptions_b eb
                on eb.id = pb.id
            where pa.is_embedded = false
                and ea.parse_exception_id is null
                and eb.parse_exception_id is null
                and pa.num_attachments &lt;&gt; pb.num_attachments
            order by
                ma.mime_string,
                pb.num_attachments - pa.num_attachments
            limit 1000;
        </sql>
    </report>

    <!-- metadata values -->
    <report reportName="Metadata Value Diffs"
            reportFilename="metadata/metadata_value_count_diffs.xlsx"
            format="xlsx"
            includeSql="true">

        <!-- Rows whose number of metadata values differs between A and B,
             restricted to rows with no parse exception id on either side.
             Because of that restriction, parse_ex_id_a and parse_ex_id_b
             are always null in the output. Sorted by mime string in A, then
             by the change in metadata value count (B minus A). -->
        <sql>
            select
                file_path,
                ma.mime_string as mime_string_a,
                mb.mime_string as mime_string_b,
                pa.num_metadata_values as num_metadata_values_a,
                pb.num_metadata_values as num_metadata_values_b,
                ea.parse_exception_id as parse_ex_id_a,
                eb.parse_exception_id as parse_ex_id_b
            from profiles_a pa
            inner join profiles_b pb
                on pb.id = pa.id
            inner join containers c
                on c.container_id = pa.container_id
            inner join mimes ma
                on ma.mime_id = pa.mime_id
            inner join mimes mb
                on mb.mime_id = pb.mime_id
            left join exceptions_a ea
                on ea.id = pa.id
            left join exceptions_b eb
                on eb.id = pb.id
            where ea.parse_exception_id is null
                and eb.parse_exception_id is null
                and pa.num_metadata_values &lt;&gt; pb.num_metadata_values
            order by
                ma.mime_string,
                pb.num_metadata_values - pa.num_metadata_values
        </sql>
    </report>

    <!-- Cleanup: drop the two md5_multiples_tmp tables that the <before>
         section of this file creates. "if exists" keeps each statement
         from failing when its table is already gone. -->
    <after>
        <sql>drop table if exists md5_multiples_tmp_a</sql>
        <sql>drop table if exists md5_multiples_tmp_b</sql>
    </after>
</reports>




© 2015 - 2025 Weber Informatics LLC | Privacy Policy