org.apache.tika.eval.app.tools.TopCommonTokenCounter Maven / Gradle / Ivy

Go to download
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.eval.app.tools;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
import java.util.Set;

import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;

import org.apache.tika.eval.core.tokens.AnalyzerManager;
import org.apache.tika.eval.core.tokens.URLEmailNormalizingFilterFactory;
import org.apache.tika.utils.ProcessUtils;

/**
 * Utility class that reads in a UTF-8 input file with one document per row
 * and outputs the 20000 tokens with the highest document frequencies.
 * 
 * The CommmonTokensAnalyzer intentionally drops tokens shorter than 4 characters,
 * but includes bigrams for cjk.
 * 
 * It also has a include list for __email__ and __url__ and a skip list
 * for common html markup terms.
 */
public class TopCommonTokenCounter {

    private static final String FIELD = "f";
    //these should exist in every list
    static Set INCLUDE_LIST = new HashSet<>(Arrays.asList(
            new String[]{URLEmailNormalizingFilterFactory.URL,
                    URLEmailNormalizingFilterFactory.EMAIL}));
    //words to ignore
    //these are common 4 letter html markup words that we do
    //not want to count in case of failed markup processing.
    //see: https://issues.apache.org/jira/browse/TIKA-2267?focusedCommentId=15872055&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-15872055
    static Set SKIP_LIST = new HashSet<>(
            Arrays.asList("span", "table", "href", "head", "title", "body", "html", "tagname",
                    "lang", "style", "script", "strong", "blockquote", "form", "iframe", "section",
                    "colspan", "rowspan"));
    private static String LICENSE =
            "# Licensed to the Apache Software Foundation (ASF) under one or more\n" +
                    "# contributor license agreements.  See the NOTICE file distributed with\n" +
                    "# this work for additional information regarding copyright ownership.\n" +
                    "# The ASF licenses this file to You under the Apache License, Version 2.0\n" +
                    "# (the \"License\"); you may not use this file except in compliance with\n" +
                    "# the License.  You may obtain a copy of the License at\n" + "#\n" +
                    "#     http://www.apache.org/licenses/LICENSE-2.0\n" + "#\n" +
                    "# Unless required by applicable law or agreed to in writing, software\n" +
                    "# distributed under the License is distributed on an \"AS IS\" BASIS,\n" +
                    "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n" +
                    "# See the License for the specific language governing permissions and\n" +
                    "# limitations under the License.\n" + "#\n";
    private static int TOP_N = 30000;
    private static int MIN_DOC_FREQ = 10;

    public static void main(String[] args) throws Exception {
        Path commonTokensFile = Paths.get(args[0]);
        List inputFiles = new ArrayList<>();
        for (int i = 1; i < args.length; i++) {
            inputFiles.add(Paths.get(ProcessUtils.unescapeCommandLine(args[i])));
        }
        TopCommonTokenCounter counter = new TopCommonTokenCounter();
        if (Files.exists(commonTokensFile)) {
            System.err.println(
                    commonTokensFile.getFileName().toString() + " exists. I'm skipping this.");
            return;
        }
        counter.execute(commonTokensFile, inputFiles);
    }

    private static void writeTopN(Path path, long totalDocs, long sumDocFreqs,
                                  long sumTotalTermFreqs, long uniqueTerms,
                                  AbstractTokenTFDFPriorityQueue queue) throws IOException {
        if (Files.isRegularFile(path)) {
            System.err.println("File " + path.getFileName() + " already exists. Skipping.");
            return;
        }
        Files.createDirectories(path.getParent());
        try (BufferedWriter writer = Files.newBufferedWriter(path, StandardCharsets.UTF_8)) {
            StringBuilder sb = new StringBuilder();
            writer.write(LICENSE);
            writer.write("#DOC_COUNT\t" + totalDocs + "\n");
            writer.write("#SUM_DOC_FREQS\t" + sumDocFreqs + "\n");
            writer.write("#SUM_TERM_FREQS\t" + sumTotalTermFreqs + "\n");
            writer.write("#UNIQUE_TERMS\t" + uniqueTerms + "\n");
            writer.write("#TOKEN\tDOCFREQ\tTERMFREQ\n");
            //add these tokens no matter what
            for (String t : INCLUDE_LIST) {
                writer.write(t);
                writer.newLine();
            }
            for (TokenDFTF tp : queue.getArray()) {
                writer.write(getRow(sb, tp) + "\n");

            }
            writer.flush();
        }
    }

    private static String getRow(StringBuilder sb, TokenDFTF tp) {
        sb.setLength(0);
        sb.append(clean(tp.token));
        sb.append("\t").append(tp.df);
        sb.append("\t").append(tp.tf);
        return sb.toString();
    }

    private static String clean(String s) {
        if (s == null) {
            return "";
        }
        return s.replaceAll("\\s+", " ").trim();
    }

    private void execute(Path commonTokensFile, List inputFiles) throws Exception {
        Path luceneDir = Files.createTempDirectory("tika-eval-lucene-");
        AbstractTokenTFDFPriorityQueue queue = new TokenDFPriorityQueue(TOP_N);
        long totalDocs = -1;
        long sumDocFreqs = -1;
        long sumTotalTermFreqs = -1;
        long uniqueTerms = -1;
        try (Directory directory = FSDirectory.open(luceneDir)) {

            AnalyzerManager analyzerManager = AnalyzerManager.newInstance(-1);

            Analyzer analyzer = analyzerManager.getCommonTokensAnalyzer();
            IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
            int maxLen = 1000000;
            int len = 0;
            try (IndexWriter writer = new IndexWriter(directory, indexWriterConfig)) {
                List docs = new ArrayList<>();
                for (Path inputFile : inputFiles) {
                    //total hack
                    boolean isLeipzig = inputFile.getFileName().toString().contains("-sentences.txt");
                    int lines = 0;
                    try (BufferedReader reader = getReader(inputFile)) {
                        String line = reader.readLine();
                        while (line != null) {
                            if (isLeipzig) {
                                int tab = line.indexOf("\t");
                                if (tab > -1) {
                                    line = line.substring(tab + 1);
                                }
                            }
                            len += line.length();
                            Document document = new Document();
                            document.add(new TextField(FIELD, line, Field.Store.NO));
                            docs.add(document);
                            if (len > maxLen) {
                                writer.addDocuments(docs);
                                docs.clear();
                                len = 0;
                            }
                            line = reader.readLine();
                            if (++lines % 100000 == 0) {
                                System.out.println(
                                        "processed " + lines + " for " + inputFile.getFileName() +
                                                " :: " + commonTokensFile.toAbsolutePath());
                            }
                        }
                    }
                }
                if (docs.size() > 0) {
                    writer.addDocuments(docs);
                }
                writer.commit();
                writer.flush();
            }

            try (IndexReader reader = DirectoryReader.open(directory)) {
                LeafReader wrappedReader = SlowCompositeReaderWrapper.wrap(reader);
                totalDocs = wrappedReader.getDocCount(FIELD);
                sumDocFreqs = wrappedReader.getSumDocFreq(FIELD);
                sumTotalTermFreqs = wrappedReader.getSumTotalTermFreq(FIELD);

                Terms terms = wrappedReader.terms(FIELD);
                TermsEnum termsEnum = terms.iterator();
                BytesRef bytesRef = termsEnum.next();
                int docsWThisField = wrappedReader.getDocCount(FIELD);
                while (bytesRef != null) {
                    uniqueTerms++;
                    int df = termsEnum.docFreq();
                    long tf = termsEnum.totalTermFreq();
                    if (MIN_DOC_FREQ > -1 && df < MIN_DOC_FREQ) {
                        bytesRef = termsEnum.next();
                        continue;
                    }

                    if (queue.top() == null || queue.size() < TOP_N || df >= queue.top().df) {
                        String t = bytesRef.utf8ToString();
                        if (!SKIP_LIST.contains(t)) {
                            queue.insertWithOverflow(new TokenDFTF(t, df, tf));
                        }

                    }
                    bytesRef = termsEnum.next();
                }
            }
        } finally {
            FileUtils.deleteDirectory(luceneDir.toFile());
        }

        writeTopN(commonTokensFile, totalDocs, sumDocFreqs, sumTotalTermFreqs, uniqueTerms, queue);


    }

    private BufferedReader getReader(Path inputFile) throws IOException {
        InputStream is = Files.newInputStream(inputFile);
        if (inputFile.toString().endsWith(".gz")) {
            is = new GzipCompressorInputStream(is);
        }
        return new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));
    }

    private abstract static class AbstractTokenTFDFPriorityQueue extends PriorityQueue {

        AbstractTokenTFDFPriorityQueue(int maxSize) {
            super(maxSize);
        }

        public TokenDFTF[] getArray() {
            TokenDFTF[] topN = new TokenDFTF[size()];
            //now we reverse the queue
            TokenDFTF term = pop();
            int i = topN.length - 1;
            while (term != null && i > -1) {
                topN[i--] = term;
                term = pop();
            }
            return topN;
        }
    }

    private static class TokenDFTF {

        final String token;
        final int df;
        final long tf;

        public TokenDFTF(String token, int df, long tf) {
            this.token = token;
            this.df = df;
            this.tf = tf;
        }


        public long getTF() {
            return tf;
        }

        public int getDF() {
            return df;
        }

        public String getToken() {
            return token;
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) {
                return true;
            }
            if (o == null || getClass() != o.getClass()) {
                return false;
            }

            TokenDFTF tokenDFTF = (TokenDFTF) o;

            if (df != tokenDFTF.df) {
                return false;
            }
            if (tf != tokenDFTF.tf) {
                return false;
            }
            return Objects.equals(token, tokenDFTF.token);
        }

        @Override
        public int hashCode() {
            int result = token != null ? token.hashCode() : 0;
            result = 31 * result + df;
            result = 31 * result + (int) (tf ^ (tf >>> 32));
            return result;
        }

        @Override
        public String toString() {
            return "TokenDFTF{" + "token='" + token + '\'' + ", df=" + df + ", tf=" + tf + '}';
        }
    }

    private static class TokenDFPriorityQueue extends AbstractTokenTFDFPriorityQueue {

        TokenDFPriorityQueue(int maxSize) {
            super(maxSize);
        }

        @Override
        protected boolean lessThan(TokenDFTF arg0, TokenDFTF arg1) {
            if (arg0.df < arg1.df) {
                return true;
            } else if (arg0.df > arg1.df) {
                return false;
            }
            return arg1.token.compareTo(arg0.token) < 0;
        }

        public TokenDFTF[] getArray() {
            TokenDFTF[] topN = new TokenDFTF[size()];
            //now we reverse the queue
            TokenDFTF term = pop();
            int i = topN.length - 1;
            while (term != null && i > -1) {
                topN[i--] = term;
                term = pop();
            }
            return topN;
        }
    }
}