
ka.tika-eval-app.2.9.1.source-code.profile-reports.xml Maven / Gradle / Ivy
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  Licensed to the Apache Software Foundation (ASF) under one or more
  contributor license agreements.  See the NOTICE file distributed with
  this work for additional information regarding copyright ownership.
  The ASF licenses this file to you under the Apache License, Version 2.0
  (the "License"); you may not use this file except in compliance with
  the License.  You may obtain a copy of the License at

      http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
-->

<!--
  Report definitions. Each <report> pairs one SQL query with an output
  file (reportFilename) and a display name (reportName).

  NOTE: the SQL lives in element text, so any literal "<" or "&" must be
  written as &lt; or &amp; or the file will not parse.
-->
<reports>

  <!-- Presumably statements to run before the reports; none are enabled. -->
  <before>
    <!-- <sql>create index on x</sql>-->
  </before>

  <!-- MIMES -->

  <!-- Row count per mime_string over all profiles, most frequent first. -->
  <report reportName="All Mimes"
          reportFilename="mimes/all_mimes.xlsx"
          format="xlsx"
          includeSql="true">
    <sql>
      select mime_string, count(1) cnt
      from profiles p
      join mimes m on m.mime_id = p.mime_id
      group by mime_string
      order by cnt desc
    </sql>
  </report>

  <!-- Same as "All Mimes", restricted to rows with is_embedded=false. -->
  <report reportName="Container Mimes"
          reportFilename="mimes/container_mimes.xlsx"
          format="xlsx"
          includeSql="true">
    <sql>
      select mime_string, count(1) cnt
      from profiles p
      join mimes m on m.mime_id = p.mime_id
      where is_embedded=false
      group by mime_string
      order by cnt desc
    </sql>
  </report>

  <!-- Same as "All Mimes", restricted to rows with is_embedded=true. -->
  <report reportName="Embedded Mimes"
          reportFilename="mimes/embedded_mimes.xlsx"
          format="xlsx"
          includeSql="true">
    <sql>
      select mime_string, count(1) cnt
      from profiles p
      join mimes m on m.mime_id = p.mime_id
      where is_embedded=true
      group by mime_string
      order by cnt desc
    </sql>
  </report>

  <!-- content -->

  <!-- Sum of num_common_tokens per common_tokens_lang, largest first. -->
  <report reportName="Common Tokens by Lang"
          reportFilename="content/common_tokens_by_lang.xlsx"
          format="xlsx"
          includeSql="true">
    <sql>
      select common_tokens_lang, sum(num_common_tokens) as cnt
      from contents
      group by common_tokens_lang
      order by cnt desc;
    </sql>
  </report>

  <!-- Row count per LANG_ID_1 value in contents, most frequent first. -->
  <report reportName="Detected Languages"
          reportFilename="content/detected_langs.xlsx"
          format="xlsx"
          includeSql="true">
    <sql>
      select LANG_ID_1 as DetectedLang, count(1) as cnt
      from contents
      group by LANG_ID_1
      order by cnt desc
    </sql>
  </report>

  <!-- Sum of num_tokens per LANG_ID_1 value in contents, largest first. -->
  <report reportName="Token Count by Detected Language"
          reportFilename="content/num_tokens_by_detected_langs.xlsx"
          format="xlsx"
          includeSql="true">
    <sql>
      select LANG_ID_1 as DetectedLang, sum(num_tokens) as cnt
      from contents
      group by LANG_ID_1
      order by cnt desc;
    </sql>
  </report>

  <!--
    Lists up to 10000 rows whose common/alphabetic token ratio is below
    0.50, or which have no alphabetic tokens at all, lowest ratio first.
    Rows whose mime_string starts with image, video or audio, or equals
    application/zip, are excluded.
  -->
  <report reportName="Common Tokens Divided by Alphabetic Tokens"
          reportFilename="content/common_tokens_div_alphabetic_exclude_media_and_zips.xlsx"
          format="xlsx"
          includeSql="true">
    <!-- 0.50 is a complete heuristic -->
    <sql>
      select file_path, file_name, is_embedded, mime_string, lang_id_1,
        common_tokens_lang, num_tokens, num_alphabetic_tokens,
        num_common_tokens,
        case
          when num_alphabetic_tokens > 0
          then cast(num_common_tokens as double)/cast(num_alphabetic_tokens as double)
          else 0
        end as common_div_alphabetic
      from contents c
      join profiles p on p.id=c.id
      join containers ct on ct.container_id=p.container_id
      join mimes m on p.mime_id=m.mime_id
      where (num_alphabetic_tokens = 0
        or cast(num_common_tokens as double)/cast(num_alphabetic_tokens as double) &lt; 0.50
      )
      and mime_string not like 'image%'
      and mime_string not like 'video%'
      and mime_string not like 'audio%'
      and mime_string not like 'application/zip'
      order by common_div_alphabetic asc
      limit 10000
    </sql>
  </report>

  <!--
    MSWord files do not usually store the actual number of pages; rather,
    they store 1 or 0, and the actual number is calculated dynamically by
    the application when the file is loaded. This will lead to some
    crazily high tokens/page counts for MSWord files, but the focus of
    this query is on the low end.

    The where clause keeps only rows with num_pages > 0, so the division
    below cannot divide by zero. Because contents is left-joined,
    num_tokens may be null for profiles without a contents row.
  -->
  <report reportName="Tokens Per Page"
          reportFilename="content/tokens_per_page_in_container_files.xlsx"
          format="xlsx"
          includeSql="true">
    <sql>
      select file_path, mime_string, num_tokens, num_pages,
        case
          when num_tokens = 0 then 0
          else cast(num_tokens as double)/cast(num_pages as double)
        end as num_tokens_div_num_pages
      from profiles p
      left join contents c on p.id=c.id
      join mimes m on p.mime_id = m.mime_id
      join containers ct on p.container_id=ct.container_id
      where num_pages is not null
      and num_pages > 0
      and is_embedded=false
      order by num_tokens_div_num_pages asc
      limit 1000
    </sql>
  </report>

  <!-- exceptions -->

  <!-- Row count per parse_exception_description, most frequent first. -->
  <report reportName="Exceptions by Type"
          reportFilename="exceptions/exceptions_by_type.xlsx"
          format="xlsx"
          includeSql="true">
    <sql>
      select parse_exception_description, count(1) cnt
      from parse_exceptions e
      join profiles p on p.id = e.id
      join ref_parse_exception_types et on et.parse_exception_id=e.parse_exception_id
      group by parse_exception_description
      order by cnt desc;
    </sql>
  </report>

  <!-- Same as "Exceptions by Type", restricted to is_embedded=true. -->
  <report reportName="Embedded Exceptions by Type"
          reportFilename="exceptions/exceptions_by_type_embedded.xlsx"
          format="xlsx"
          includeSql="true">
    <sql>
      select parse_exception_description, count(1) cnt
      from parse_exceptions e
      join profiles p on p.id = e.id
      join ref_parse_exception_types et on et.parse_exception_id=e.parse_exception_id
      where is_embedded=true
      group by parse_exception_description
      order by cnt desc;
    </sql>
  </report>

  <!--
    Exception counts broken down by mime type and exception description.
    NOTE(review): the query groups by p.mime_id but selects mime_string;
    this relies on the database accepting that, or on mime_id and
    mime_string being one-to-one. Confirm against the target database.
  -->
  <report reportName="AllExceptionsByMimeByType"
          reportFilename="exceptions/exceptions_by_mime_by_type.xlsx"
          format="xlsx"
          includeSql="true">
    <sql>
      select mime_string as MIME_TYPE,
        parse_exception_description as EXCEPTION_TYPE,
        count(1) as COUNT
      from parse_exceptions e
      join profiles p on p.id=e.id
      join containers c on p.container_id=c.container_id
      join mimes m on m.mime_id=p.mime_id
      join ref_parse_exception_types r on r.parse_exception_id=e.parse_exception_id
      group by p.mime_id, parse_exception_description
      order by MIME_TYPE, EXCEPTION_TYPE
    </sql>
  </report>

  <!--
    Count of each sort_stack_trace per mime type, limited to rows with
    parse_exception_id=0.
    NOTE(review): presumably id 0 is the exception type that carries a
    stack trace; confirm against ref_parse_exception_types.
  -->
  <report reportName="StackTracesByMime"
          reportFilename="exceptions/stack_traces_by_mime.xlsx"
          format="xlsx"
          includeSql="true">
    <sql>
      select MIME_STRING as MIME_TYPE, e.sort_stack_trace, count(1) as COUNT
      from parse_exceptions e
      join profiles p on p.id=e.id
      join mimes m on m.mime_id=p.mime_id
      and e.parse_exception_id=0
      group by MIME_TYPE, e.sort_stack_trace
      order by MIME_TYPE asc, COUNT desc
    </sql>
  </report>

  <!--
    One row per parse exception with parse_exception_id=0, giving the
    file, container length, mime type and both stack trace columns.
  -->
  <report reportName="AllStackTraces"
          reportFilename="exceptions/stack_traces_details.xlsx"
          format="xlsx"
          includeSql="true">
    <sql>
      select file_path, file_name, is_embedded,
        c.length as CONTAINER_LENGTH,
        mime_string as MIME_TYPE,
        orig_stack_trace, sort_stack_trace
      from parse_exceptions e
      join profiles p on p.id=e.id
      join containers c on p.container_id=c.container_id
      join mimes m on m.mime_id=p.mime_id
      and e.parse_exception_id=0
      order by MIME_TYPE asc, sort_stack_trace, orig_stack_trace,
        CONTAINER_LENGTH asc
    </sql>
  </report>

  <!-- Presumably statements to run after the reports; none are enabled. -->
  <after>
    <!--<sql>drop index on x</sql> -->
  </after>

</reports>
© 2015 - 2025 Weber Informatics LLC | Privacy Policy