
org.apache.tika.eval.tools.TopCommonTokenCounter

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.eval.tools;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;
import org.apache.tika.eval.tokens.AnalyzerManager;
import org.apache.tika.eval.tokens.URLEmailNormalizingFilterFactory;
import org.apache.tika.utils.ProcessUtils;

/**
 * Utility class that reads in a UTF-8 input file with one document per row
 * and outputs the TOP_N (currently 30,000) tokens with the highest document frequencies.
 *
 * The CommonTokensAnalyzer intentionally drops tokens shorter than 4 characters,
 * but includes bigrams for CJK.
 *
 * It also has a whitelist for __email__ and __url__ and a blacklist
 * of common HTML markup terms.
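 *
 * Example usage (illustrative; the jar and corpus file names are hypothetical):
 * <pre>
 * java -cp tika-eval-X.Y.jar org.apache.tika.eval.tools.TopCommonTokenCounter \
 *     common_tokens_en.txt eng_wikipedia-sentences.txt.gz
 * </pre>
 * The first argument is the output file; each remaining argument is a UTF-8
 * input file (optionally gzip-compressed) to count tokens from.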
 */
public class TopCommonTokenCounter {

    private static String LICENSE =
            "# Licensed to the Apache Software Foundation (ASF) under one or more\n" +
            "# contributor license agreements.  See the NOTICE file distributed with\n" +
            "# this work for additional information regarding copyright ownership.\n" +
            "# The ASF licenses this file to You under the Apache License, Version 2.0\n" +
            "# (the \"License\"); you may not use this file except in compliance with\n" +
            "# the License.  You may obtain a copy of the License at\n" +
            "#\n" +
            "#     http://www.apache.org/licenses/LICENSE-2.0\n" +
            "#\n" +
            "# Unless required by applicable law or agreed to in writing, software\n" +
            "# distributed under the License is distributed on an \"AS IS\" BASIS,\n" +
            "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n" +
            "# See the License for the specific language governing permissions and\n" +
            "# limitations under the License.\n"+
            "#\n";

    private static final String FIELD = "f";
    private static int TOP_N = 30000;
    private static int MIN_DOC_FREQ = 10;
    //these should exist in every list
    static Set<String> WHITE_LIST = new HashSet<>(Arrays.asList(
            new String[] {
                    URLEmailNormalizingFilterFactory.URL,
                    URLEmailNormalizingFilterFactory.EMAIL
            }
    ));

    //words to ignore
    //these are common 4 letter html markup words that we do
    //not want to count in case of failed markup processing.
    //see: https://issues.apache.org/jira/browse/TIKA-2267?focusedCommentId=15872055&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-15872055
    static Set<String> BLACK_LIST = new HashSet<>(Arrays.asList(
            "span",
            "table",
            "href",
            "head",
            "title",
            "body",
            "html",
            "tagname",
            "lang",
            "style",
            "script",
            "strong",
            "blockquote",
            "form",
            "iframe",
            "section",
            "colspan",
            "rowspan"
    ));

    public static void main(String[] args) throws Exception {
        Path commonTokensFile = Paths.get(args[0]);
        List<Path> inputFiles = new ArrayList<>();
        for (int i = 1; i < args.length; i++) {
            inputFiles.add(Paths.get(
                    ProcessUtils.unescapeCommandLine(args[i])));
        }
        TopCommonTokenCounter counter = new TopCommonTokenCounter();
        if (Files.exists(commonTokensFile)) {
            System.err.println(commonTokensFile.getFileName().toString()+
                    " exists. I'm skipping this.");
            return;
        }
        counter.execute(commonTokensFile, inputFiles);
    }

    private void execute(Path commonTokensFile, List<Path> inputFiles) throws Exception {
        Path luceneDir = Files.createTempDirectory("tika-eval-lucene-");
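        //build a throwaway Lucene index in a temporary directory;
        //the directory is deleted in the finally block below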
        AbstractTokenTFDFPriorityQueue queue = new TokenDFPriorityQueue(TOP_N);
        long totalDocs = -1;
        long sumDocFreqs = -1;
        long sumTotalTermFreqs = -1;
        long uniqueTerms = 0;
        try (Directory directory = FSDirectory.open(luceneDir)) {

            AnalyzerManager analyzerManager = AnalyzerManager.newInstance(-1);

            Analyzer analyzer = analyzerManager.getCommonTokensAnalyzer();
            IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
            int maxLen = 1000000;
            int len = 0;
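            //Each line of each input file becomes its own Lucene document,
            //so a token's docFreq below is the number of lines that contain it.
            //Documents are buffered and flushed once roughly maxLen characters
            //of text have accumulated.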
            try (IndexWriter writer = new IndexWriter(directory, indexWriterConfig)) {
                List<Document> docs = new ArrayList<>();
                for (Path inputFile : inputFiles) {
                    //total hack
                    boolean isLeipzig = false;
                    if (inputFile.getFileName().toString().contains("-sentences.txt")) {
                        isLeipzig = true;
                    }
                    int lines = 0;
                    try (BufferedReader reader = getReader(inputFile)) {
                        String line = reader.readLine();
                        while (line != null) {
                            if (isLeipzig) {
                                int tab = line.indexOf("\t");
                                if (tab > -1) {
                                    line = line.substring(tab+1);
                                }
                            }
                            len += line.length();
                            Document document = new Document();
                            document.add(new TextField(FIELD, line, Field.Store.NO));
                            docs.add(document);
                            if (len > maxLen) {
                                writer.addDocuments(docs);
                                docs.clear();
                                len = 0;
                            }
                            line = reader.readLine();
                            if (++lines % 100000 == 0) {
                                System.out.println("processed "+lines +
                                        " for "+inputFile.getFileName()
                                + " :: "+ commonTokensFile.toAbsolutePath());
                            }
                        }
                    }
                }
                if (docs.size() > 0) {
                    writer.addDocuments(docs);
                }
                writer.commit();
                writer.flush();
            }

            try (IndexReader reader = DirectoryReader.open(directory)) {
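                //collapse the (possibly multi-segment) index into a single LeafReader
                //so that every term in FIELD can be enumerated with its document
                //frequency and total term frequency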
                LeafReader wrappedReader = SlowCompositeReaderWrapper.wrap(reader);
                totalDocs = wrappedReader.getDocCount(FIELD);
                sumDocFreqs = wrappedReader.getSumDocFreq(FIELD);
                sumTotalTermFreqs = wrappedReader.getSumTotalTermFreq(FIELD);

                Terms terms = wrappedReader.terms(FIELD);
                TermsEnum termsEnum = terms.iterator();
                BytesRef bytesRef = termsEnum.next();
                while (bytesRef != null) {
                    uniqueTerms++;
                    int df = termsEnum.docFreq();
                    long tf = termsEnum.totalTermFreq();
                    if (MIN_DOC_FREQ > -1 && df < MIN_DOC_FREQ) {
                        bytesRef = termsEnum.next();
                        continue;
                    }

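                    //only attempt an insert when the queue is not yet full or when this
                    //term's doc freq is at least the smallest doc freq currently in the queue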
                    if (queue.top() == null || queue.size() < TOP_N ||
                            df >= queue.top().df) {
                        String t = bytesRef.utf8ToString();
                        if (! BLACK_LIST.contains(t)) {
                            queue.insertWithOverflow(new TokenDFTF(t, df, tf));
                        }

                    }
                    bytesRef = termsEnum.next();
                }
            }
        } finally {
            FileUtils.deleteDirectory(luceneDir.toFile());
        }

        writeTopN(commonTokensFile, totalDocs,
                sumDocFreqs, sumTotalTermFreqs, uniqueTerms, queue);


    }

    private BufferedReader getReader(Path inputFile) throws IOException {
        InputStream is = Files.newInputStream(inputFile);
        if (inputFile.toString().endsWith(".gz")) {
            is = new GzipCompressorInputStream(is);
        }
        return new BufferedReader(
                new InputStreamReader(is, StandardCharsets.UTF_8)
        );
    }

    private static void writeTopN(Path path,
                                  long totalDocs, long sumDocFreqs,
                                  long sumTotalTermFreqs,
                                  long uniqueTerms, AbstractTokenTFDFPriorityQueue queue) throws IOException {
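        //output format: the license header, "#"-prefixed summary lines, the
        //whitelisted tokens, then one tab-separated row per counted token:
        //TOKEN, DOCFREQ, TERMFREQ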
        if (Files.isRegularFile(path)) {
            System.err.println("File "+path.getFileName() + " already exists. Skipping.");
            return;
        }
        Files.createDirectories(path.getParent());
        try (BufferedWriter writer =
                Files.newBufferedWriter(path, StandardCharsets.UTF_8)) {
            StringBuilder sb = new StringBuilder();
            writer.write(LICENSE);
            writer.write("#DOC_COUNT\t" + totalDocs + "\n");
            writer.write("#SUM_DOC_FREQS\t" + sumDocFreqs + "\n");
            writer.write("#SUM_TERM_FREQS\t" + sumTotalTermFreqs + "\n");
            writer.write("#UNIQUE_TERMS\t" + uniqueTerms + "\n");
            writer.write("#TOKEN\tDOCFREQ\tTERMFREQ\n");
            //add these tokens no matter what
            for (String t : WHITE_LIST) {
                writer.write(t);
                writer.newLine();
            }
            for (TokenDFTF tp : queue.getArray()) {
                writer.write(getRow(sb, tp) + "\n");
            }
            writer.flush();
        }
    }

    private static String getRow(StringBuilder sb, TokenDFTF tp) {
        sb.setLength(0);
        sb.append(clean(tp.token));
        sb.append("\t").append(tp.df);
        sb.append("\t").append(tp.tf);
        return sb.toString();
    }

    private static String clean(String s) {
        if (s == null) {
            return "";
        }
        return s.replaceAll("\\s+", " ").trim();
    }

    private abstract class AbstractTokenTFDFPriorityQueue extends PriorityQueue<TokenDFTF> {

        AbstractTokenTFDFPriorityQueue(int maxSize) {
            super(maxSize);
        }

        public TokenDFTF[] getArray() {
            TokenDFTF[] topN = new TokenDFTF[size()];
            //now we reverse the queue
            TokenDFTF term = pop();
            int i = topN.length-1;
            while (term != null && i > -1) {
                topN[i--] = term;
                term = pop();
            }
            return topN;
        }
    }

    private class TokenDFTF {

        final String token;
        final int df;
        final long tf;

        public TokenDFTF(String token, int df, long tf) {
            this.token = token;
            this.df = df;
            this.tf = tf;
        }


        public long getTF() {
            return tf;
        }

        public int getDF() {
            return df;
        }

        public String getToken() {
            return token;
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) return true;
            if (o == null || getClass() != o.getClass()) return false;

            TokenDFTF tokenDFTF = (TokenDFTF) o;

            if (df != tokenDFTF.df) return false;
            if (tf != tokenDFTF.tf) return false;
            return token != null ? token.equals(tokenDFTF.token) : tokenDFTF.token == null;
        }

        @Override
        public int hashCode() {
            int result = token != null ? token.hashCode() : 0;
            result = 31 * result + df;
            result = 31 * result + (int) (tf ^ (tf >>> 32));
            return result;
        }

        @Override
        public String toString() {
            return "TokenDFTF{" +
                    "token='" + token + '\'' +
                    ", df=" + df +
                    ", tf=" + tf +
                    '}';
        }
    }

    private class TokenDFPriorityQueue extends AbstractTokenTFDFPriorityQueue {

        TokenDFPriorityQueue(int maxSize) {
            super(maxSize);
        }

        @Override
        protected boolean lessThan(TokenDFTF arg0, TokenDFTF arg1) {
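            //order primarily by document frequency so that the entry with the lowest
            //doc freq sits at the head of the queue and is evicted first on overflow;
            //ties are broken on the token string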
            if (arg0.df < arg1.df) {
                return true;
            } else if (arg0.df > arg1.df) {
                return false;
            }
            return arg1.token.compareTo(arg0.token) < 0;
        }

        public TokenDFTF[] getArray() {
            TokenDFTF[] topN = new TokenDFTF[size()];
            //now we reverse the queue
            TokenDFTF term = pop();
            int i = topN.length-1;
            while (term != null && i > -1) {
                topN[i--] = term;
                term = pop();
            }
            return topN;
        }
    }
}



