All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.tika.eval.app.tools.TopCommonTokenCounter Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.eval.app.tools;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
import java.util.Set;

import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;

import org.apache.tika.eval.core.tokens.AnalyzerManager;
import org.apache.tika.eval.core.tokens.URLEmailNormalizingFilterFactory;
import org.apache.tika.utils.ProcessUtils;

/**
 * Utility class that reads in a UTF-8 input file with one document per row
 * and outputs the 20000 tokens with the highest document frequencies.
 * 

* The CommmonTokensAnalyzer intentionally drops tokens shorter than 4 characters, * but includes bigrams for cjk. *

* It also has a include list for __email__ and __url__ and a skip list * for common html markup terms. */ public class TopCommonTokenCounter { private static final String FIELD = "f"; //these should exist in every list static Set INCLUDE_LIST = new HashSet<>(Arrays.asList( new String[]{URLEmailNormalizingFilterFactory.URL, URLEmailNormalizingFilterFactory.EMAIL})); //words to ignore //these are common 4 letter html markup words that we do //not want to count in case of failed markup processing. //see: https://issues.apache.org/jira/browse/TIKA-2267?focusedCommentId=15872055&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-15872055 static Set SKIP_LIST = new HashSet<>( Arrays.asList("span", "table", "href", "head", "title", "body", "html", "tagname", "lang", "style", "script", "strong", "blockquote", "form", "iframe", "section", "colspan", "rowspan")); private static String LICENSE = "# Licensed to the Apache Software Foundation (ASF) under one or more\n" + "# contributor license agreements. See the NOTICE file distributed with\n" + "# this work for additional information regarding copyright ownership.\n" + "# The ASF licenses this file to You under the Apache License, Version 2.0\n" + "# (the \"License\"); you may not use this file except in compliance with\n" + "# the License. You may obtain a copy of the License at\n" + "#\n" + "# http://www.apache.org/licenses/LICENSE-2.0\n" + "#\n" + "# Unless required by applicable law or agreed to in writing, software\n" + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n" + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n" + "# See the License for the specific language governing permissions and\n" + "# limitations under the License.\n" + "#\n"; private static int TOP_N = 30000; private static int MIN_DOC_FREQ = 10; public static void main(String[] args) throws Exception { Path commonTokensFile = Paths.get(args[0]); List inputFiles = new ArrayList<>(); for (int i = 1; i < args.length; i++) { inputFiles.add(Paths.get(ProcessUtils.unescapeCommandLine(args[i]))); } TopCommonTokenCounter counter = new TopCommonTokenCounter(); if (Files.exists(commonTokensFile)) { System.err.println( commonTokensFile.getFileName().toString() + " exists. I'm skipping this."); return; } counter.execute(commonTokensFile, inputFiles); } private static void writeTopN(Path path, long totalDocs, long sumDocFreqs, long sumTotalTermFreqs, long uniqueTerms, AbstractTokenTFDFPriorityQueue queue) throws IOException { if (Files.isRegularFile(path)) { System.err.println("File " + path.getFileName() + " already exists. Skipping."); return; } Files.createDirectories(path.getParent()); try (BufferedWriter writer = Files.newBufferedWriter(path, StandardCharsets.UTF_8)) { StringBuilder sb = new StringBuilder(); writer.write(LICENSE); writer.write("#DOC_COUNT\t" + totalDocs + "\n"); writer.write("#SUM_DOC_FREQS\t" + sumDocFreqs + "\n"); writer.write("#SUM_TERM_FREQS\t" + sumTotalTermFreqs + "\n"); writer.write("#UNIQUE_TERMS\t" + uniqueTerms + "\n"); writer.write("#TOKEN\tDOCFREQ\tTERMFREQ\n"); //add these tokens no matter what for (String t : INCLUDE_LIST) { writer.write(t); writer.newLine(); } for (TokenDFTF tp : queue.getArray()) { writer.write(getRow(sb, tp) + "\n"); } writer.flush(); } } private static String getRow(StringBuilder sb, TokenDFTF tp) { sb.setLength(0); sb.append(clean(tp.token)); sb.append("\t").append(tp.df); sb.append("\t").append(tp.tf); return sb.toString(); } private static String clean(String s) { if (s == null) { return ""; } return s.replaceAll("\\s+", " ").trim(); } private void execute(Path commonTokensFile, List inputFiles) throws Exception { Path luceneDir = Files.createTempDirectory("tika-eval-lucene-"); AbstractTokenTFDFPriorityQueue queue = new TokenDFPriorityQueue(TOP_N); long totalDocs = -1; long sumDocFreqs = -1; long sumTotalTermFreqs = -1; long uniqueTerms = -1; try (Directory directory = FSDirectory.open(luceneDir)) { AnalyzerManager analyzerManager = AnalyzerManager.newInstance(-1); Analyzer analyzer = analyzerManager.getCommonTokensAnalyzer(); IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer); int maxLen = 1000000; int len = 0; try (IndexWriter writer = new IndexWriter(directory, indexWriterConfig)) { List docs = new ArrayList<>(); for (Path inputFile : inputFiles) { //total hack boolean isLeipzig = inputFile.getFileName().toString().contains("-sentences.txt"); int lines = 0; try (BufferedReader reader = getReader(inputFile)) { String line = reader.readLine(); while (line != null) { if (isLeipzig) { int tab = line.indexOf("\t"); if (tab > -1) { line = line.substring(tab + 1); } } len += line.length(); Document document = new Document(); document.add(new TextField(FIELD, line, Field.Store.NO)); docs.add(document); if (len > maxLen) { writer.addDocuments(docs); docs.clear(); len = 0; } line = reader.readLine(); if (++lines % 100000 == 0) { System.out.println( "processed " + lines + " for " + inputFile.getFileName() + " :: " + commonTokensFile.toAbsolutePath()); } } } } if (docs.size() > 0) { writer.addDocuments(docs); } writer.commit(); writer.flush(); } try (IndexReader reader = DirectoryReader.open(directory)) { LeafReader wrappedReader = SlowCompositeReaderWrapper.wrap(reader); totalDocs = wrappedReader.getDocCount(FIELD); sumDocFreqs = wrappedReader.getSumDocFreq(FIELD); sumTotalTermFreqs = wrappedReader.getSumTotalTermFreq(FIELD); Terms terms = wrappedReader.terms(FIELD); TermsEnum termsEnum = terms.iterator(); BytesRef bytesRef = termsEnum.next(); int docsWThisField = wrappedReader.getDocCount(FIELD); while (bytesRef != null) { uniqueTerms++; int df = termsEnum.docFreq(); long tf = termsEnum.totalTermFreq(); if (MIN_DOC_FREQ > -1 && df < MIN_DOC_FREQ) { bytesRef = termsEnum.next(); continue; } if (queue.top() == null || queue.size() < TOP_N || df >= queue.top().df) { String t = bytesRef.utf8ToString(); if (!SKIP_LIST.contains(t)) { queue.insertWithOverflow(new TokenDFTF(t, df, tf)); } } bytesRef = termsEnum.next(); } } } finally { FileUtils.deleteDirectory(luceneDir.toFile()); } writeTopN(commonTokensFile, totalDocs, sumDocFreqs, sumTotalTermFreqs, uniqueTerms, queue); } private BufferedReader getReader(Path inputFile) throws IOException { InputStream is = Files.newInputStream(inputFile); if (inputFile.toString().endsWith(".gz")) { is = new GzipCompressorInputStream(is); } return new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8)); } private abstract static class AbstractTokenTFDFPriorityQueue extends PriorityQueue { AbstractTokenTFDFPriorityQueue(int maxSize) { super(maxSize); } public TokenDFTF[] getArray() { TokenDFTF[] topN = new TokenDFTF[size()]; //now we reverse the queue TokenDFTF term = pop(); int i = topN.length - 1; while (term != null && i > -1) { topN[i--] = term; term = pop(); } return topN; } } private static class TokenDFTF { final String token; final int df; final long tf; public TokenDFTF(String token, int df, long tf) { this.token = token; this.df = df; this.tf = tf; } public long getTF() { return tf; } public int getDF() { return df; } public String getToken() { return token; } @Override public boolean equals(Object o) { if (this == o) { return true; } if (o == null || getClass() != o.getClass()) { return false; } TokenDFTF tokenDFTF = (TokenDFTF) o; if (df != tokenDFTF.df) { return false; } if (tf != tokenDFTF.tf) { return false; } return Objects.equals(token, tokenDFTF.token); } @Override public int hashCode() { int result = token != null ? token.hashCode() : 0; result = 31 * result + df; result = 31 * result + (int) (tf ^ (tf >>> 32)); return result; } @Override public String toString() { return "TokenDFTF{" + "token='" + token + '\'' + ", df=" + df + ", tf=" + tf + '}'; } } private static class TokenDFPriorityQueue extends AbstractTokenTFDFPriorityQueue { TokenDFPriorityQueue(int maxSize) { super(maxSize); } @Override protected boolean lessThan(TokenDFTF arg0, TokenDFTF arg1) { if (arg0.df < arg1.df) { return true; } else if (arg0.df > arg1.df) { return false; } return arg1.token.compareTo(arg0.token) < 0; } public TokenDFTF[] getArray() { TokenDFTF[] topN = new TokenDFTF[size()]; //now we reverse the queue TokenDFTF term = pop(); int i = topN.length - 1; while (term != null && i > -1) { topN[i--] = term; term = pop(); } return topN; } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy