org.apache.stanbol.commons.opennlp.TextAnalyzer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of org.apache.stanbol.commons.opennlp Show documentation
Provides a Bundle and support for the management of Models. The Data File Provider infrastructure is used to load requested models. Some Modles for English are included. Other models MUST be provided via the DataFileProvider infrastrucutre. (e.g. by including them in there classpath and providing an own DataFileProvider or by users adding the requred files to the "/datafiles" folder in the Stanbol installation)
The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.commons.opennlp;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;

import opennlp.tools.chunker.Chunker;
import opennlp.tools.chunker.ChunkerME;
import opennlp.tools.chunker.ChunkerModel;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.sentdetect.SentenceDetector;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.util.Sequence;
import opennlp.tools.util.Span;

import org.apache.felix.scr.annotations.Reference;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * @deprecated replaced by STANBOL-733 (stanbol nlp processing module
 */
public class TextAnalyzer {
    
    private static final Logger log = LoggerFactory.getLogger(TextAnalyzer.class);
    @Reference
    private final OpenNLP openNLP;
    
    private final TextAnalyzerConfig config;
    
    /**
     * @deprecated replaced by STANBOL-733 (stanbol nlp processing module
     */
    public static final class TextAnalyzerConfig {
        protected boolean forceSimpleTokenizer = false; //default to false
        protected boolean forceKeywordTokenizer = false; //default to false
        protected boolean enablePosTagger = true;
        protected boolean enableChunker = true;
        protected boolean enableSentenceDetector = true;
        protected boolean enablePosTypeChunker = true;
        protected boolean forcePosTypeChunker = true;
        /**
         * The minimum POS type probability used by the PosTypeChunker
         */
        private double minPosTagProbability = 0.75;
        public final boolean isSimpleTokenizerForced() {
            return forceSimpleTokenizer;
        }
    
        public final void forceSimpleTokenizer(boolean useSimpleTokenizer) {
            this.forceSimpleTokenizer = useSimpleTokenizer;
            if(useSimpleTokenizer){
                this.forceKeywordTokenizer = false;
            }
        }
        public final boolean isKeywordTokenizerForced() {
            return forceKeywordTokenizer;
        }
    
        public final void forceKeywordTokenizer(boolean useKeywordTokenizer) {
            this.forceKeywordTokenizer = useKeywordTokenizer;
            if(useKeywordTokenizer){
                this.forceSimpleTokenizer = false;
            }
        }
    
        public final boolean isPosTaggerEnable() {
            return enablePosTagger;
        }
    
        public final void enablePosTagger(boolean enablePosTagger) {
            this.enablePosTagger = enablePosTagger;
        }
    
        public final boolean isChunkerEnabled() {
            return enableChunker;
        }
    
        public final void enableChunker(boolean enableChunker) {
            this.enableChunker = enableChunker;
        }
    
        public final boolean isSentenceDetectorEnabled() {
            return enableSentenceDetector;
        }
    
        public final void enableSentenceDetector(boolean enableSentenceDetector) {
            this.enableSentenceDetector = enableSentenceDetector;
        }
        public final boolean isPosTypeChunkerEnabled() {
            return enablePosTypeChunker;
        }
        /**
         * Enables the used of the {@link PosTypeChunker} if no {@link Chunker} for
         * the current {@link #getLanguage() language} is available.
         * @param enablePosTypeChunker
         */
        public final void enablePosTypeChunker(boolean enablePosTypeChunker) {
            this.enablePosTypeChunker = enablePosTypeChunker;
            if(!enablePosTypeChunker){
                forcePosTypeChunker(enablePosTypeChunker);
            }
        }
    
        public final boolean isPosTypeChunkerForced() {
            return forcePosTypeChunker;
        }
        /**
         * Forces the use of the {@link PosTypeChunker} even if a {@link Chunker}
         * for the current language would be available
         * @param forcePosTypeChunker
         */
        public final void forcePosTypeChunker(boolean forcePosTypeChunker) {
            this.forcePosTypeChunker = forcePosTypeChunker;
            if(forcePosTypeChunker) {
                enablePosTypeChunker(true);
            }
        }
    
        /**
         * Getter for the minimum POS tag probability so that the
         * {@link PosTypeChunker} processes a POS tag.
         * @return the minPosTypeProbability
         */
        public final double getMinPosTypeProbability() {
            return minPosTagProbability;
        }
    
        /**
         * Setter for the minimum POS tag probability so that the
         * {@link PosTypeChunker} processes a POS tag.
         * @param minPosTagProbability The probability [0..1] or value < 0 to 
         * deactivate this feature
         * @throws IllegalArgumentException if values > 1 are parsed as probability
         */
        public final void setMinPosTagProbability(double probability) {
            if(probability > 1){
                throw new IllegalArgumentException("The minimum POS tag probability MUST be set to a value <= 1 (parsed:"+minPosTagProbability+"");
            }
            this.minPosTagProbability = probability;
        }

    }
    
    private POSTaggerME posTagger;
    /**
     * used to ensure that {@link #openNLP} is only ask once for the {@link POSTaggerME}
     * of the parsed {@link #language}
     */
    private boolean posTaggerNotAvailable;
    private SentenceDetector sentenceDetector;
    /**
     * used to ensure that {@link #openNLP} is only ask once for the {@link SentenceDetector}
     * of the parsed {@link #language}
     */
    private boolean sentenceDetectorNotAvailable;
    private ChunkerME chunker;
    /**
     * used to ensure that {@link #openNLP} is only ask once for the {@link ChunkerME}
     * of the parsed {@link #language}
     */
    private boolean chunkerNotAvailable;
    private PosTypeChunker posTypeChunker;
    /**
     * used to ensure only a single try to init a {@link PosTypeChunker} for 
     * the parsed {@link #language}
     */
    private boolean posTypeChunkerNotAvailable;
    /**
     * The Tokenizer
     */
    private Tokenizer tokenizer;
    /**
     * The language
     */
    private final String language;


    /**
     * Creates a TextAnalyzer based on the OpenNLP and the given language and the
     * default {@link TextAnalyzerConfig configuration}.
     * If null is parsed as language, than a minimal configuration
     * that tokenizes the text using the {@link SimpleTokenizer} is used. 
     * @param openNLP The openNLP configuration to be used to analyze the text
     * @param language the language or null if not known.
     */
    public TextAnalyzer(OpenNLP openNLP,String language){
        this(openNLP,language,null);
    }
    /**
     * Creates a TextAnalyzer based on the OpenNLP and the given language.

     * If null is parsed as language, than a minimal configuration
     * that tokenizes the text using the {@link SimpleTokenizer} is used. 
     * @param openNLP The openNLP configuration to be used to analyze the text
     * @param language the language or null if not known.
     */
    public TextAnalyzer(OpenNLP openNLP,String language, TextAnalyzerConfig config){
        if(openNLP == null){
            throw new IllegalArgumentException("The OpenNLP component MUST NOT be NULL");
        }
        this.config = config == null ? new TextAnalyzerConfig() : config;
        this.openNLP = openNLP;
        this.language = language;
    }

    protected final POSTaggerME getPosTagger() {
        if(!config.enablePosTagger){
            return null;
        }
        if(posTagger == null && !posTaggerNotAvailable){
            try {
                POSModel posModel = openNLP.getPartOfSpeechModel(language);
                if(posModel != null){
                    posTagger = new POSTaggerME(posModel);
                } else {
                    log.debug("No POS Model for language '{}'",language);
                    posTaggerNotAvailable = true;
                }
            } catch (IOException e) {
                log.info("Unable to load POS Model for language '"+language+"'",e);
                posTaggerNotAvailable = true;
            }
        }
        return posTagger;
    }
    /**
     * Getter for the Tokenizer of a given language
     * @param language the language
     * @return the Tolenizer
     */
    public final Tokenizer getTokenizer(){
        if(tokenizer == null){
            if(config.forceSimpleTokenizer){
                tokenizer = SimpleTokenizer.INSTANCE;
            } else if(config.forceKeywordTokenizer){
                tokenizer = KeywordTokenizer.INSTANCE;
            } else {
                tokenizer = openNLP.getTokenizer(language);
                if(tokenizer == null){
                    log.debug("No Tokenizer for Language '{}': fall back to SimpleTokenizer!",language);
                    tokenizer = SimpleTokenizer.INSTANCE;
                }
            }
        }
        return tokenizer;
    }
    protected final ChunkerME getChunker(){
        if(!config.enableChunker || config.forcePosTypeChunker){
            return null;
        }
        if(chunker == null && !chunkerNotAvailable) {
            try {
                ChunkerModel chunkerModel = openNLP.getChunkerModel(language);
                if(chunkerModel != null){
                    chunker = new ChunkerME(chunkerModel);
                } else {
                    log.debug("No Chunker Model for language {}",language);
                    chunkerNotAvailable = true;
                }
            } catch (IOException e) {
                log.info("Unable to load Chunker Model for language "+language,e);
                chunkerNotAvailable = true;
            }
        }
        return chunker;
    }
    protected final PosTypeChunker getPosTypeChunker(){
        if(!config.enableChunker || !config.enablePosTagger){
            return null;
        }
        if(posTypeChunker == null && !posTypeChunkerNotAvailable){
            posTypeChunker = PosTypeChunker.getInstance(language,config.minPosTagProbability);
            posTypeChunkerNotAvailable = posTypeChunker == null;
        }
        return posTypeChunker;
    }

    protected final SentenceDetector getSentenceDetector() {
        if(!config.enableSentenceDetector){
            return null;
        }
        if(sentenceDetector == null && !sentenceDetectorNotAvailable){
            try {
                SentenceModel sentModel = openNLP.getSentenceModel(language);
                if(sentModel != null){
                    sentenceDetector = new SentenceDetectorME(sentModel);
                } else {
                    log.debug("No Sentence Detection Model for language '{}'",language);
                    sentenceDetectorNotAvailable = true;
                }
            } catch (IOException e) {
                log.info("Unable to load Sentence Detection Model for language '"+language+"'",e);
                sentenceDetectorNotAvailable = true;
            }
        }
        return sentenceDetector;
    }

    public final OpenNLP getOpenNLP() {
        return openNLP;
    }
    /**
     * @return the config
     */
    public final TextAnalyzerConfig getConfig() {
        return config;
    }
    /**
     * @return the language
     */
    public final String getLanguage() {
        return language;
    }


    /**
     * Analyses the parsed text in a single chunk. No sentence detector is used
     * @param sentence the sentence (text) to analyse
     * @return the Analysed text
     */
    public AnalysedText analyseSentence(String sentence){
        return new AnalysedText(sentence,language);
    }
    /**
     * Analyses sentence by sentence when {@link Iterator#next()} is called on
     * the returned Iterator. Changes to the configuration of this class will
     * have an effect on the analysis results of this iterator.
     * if no sentence detector is available the whole text is parsed at once. 
     * @param text The text to analyse
     * @return Iterator the analyses the parsed text sentence by sentence on
     * calls to {@link Iterator#next()}.
     */
    public Iterator analyse(String text){
        return new TextAnalysisIterator(text, language);
    }
    
    /**
     * @deprecated replaced by STANBOL-733 (stanbol nlp processing module
     */
    private final class TextAnalysisIterator implements Iterator {
        private final String text;
        private final Span[] sentenceSpans;
        private int current = 0;
        private final String language;
        private TextAnalysisIterator(String text,String language){
            this.text = text;
            this.language = language;
            if(text == null || text.isEmpty()){
                sentenceSpans = new Span[]{};
            } else {
                SentenceDetector sd = getSentenceDetector();
                if(sd != null){
                    sentenceSpans = sd.sentPosDetect(text);
                } else {
                    sentenceSpans = new Span[]{new Span(0, text.length())};
                }
            }
        }
        @Override
        public boolean hasNext() {
            return sentenceSpans.length > current;
        }

        @Override
        public AnalysedText next() {
            Span sentenceSpan = sentenceSpans[current];
            String sentence = sentenceSpan.getCoveredText(text).toString();
            current++; //mark this as consumed and navigate to the next
            return new AnalysedText(sentence,language,sentenceSpan.getStart());
        }

        @Override
        public void remove() {
            throw new UnsupportedOperationException(
                "Removal of Sentences of the prsed Text is not supported!");
        }
    }

    /**
     * @deprecated replaced by STANBOL-733 (stanbol nlp processing module
     */
    public final class AnalysedText {
        //NOTE: Members are protected to allow the JVM direct access
        /**
         * The analysed sentence
         */
        protected final String sentence;
        /**
         * Final and {@link Collections#unmodifiableList(List) unmodifiable list}
         * with the tokens of the analysed {@link #sentence}.
         */
        protected final List tokens;
        /**
         * Final and {@link Collections#unmodifiableList(List) unmodifiable list}
         * with the chunks of the analysed {@link #sentence} or null
         * of no chunks are available
         */
        protected final List chunks;
        /**
         * The offset of the sentence with respect to the whole text. Note that
         * {@link AnalysedText this class} only holds the offset and no reference
         * to the whole text. 0 indicates that this represents the
         * start of the text (this may also indicate that the {@link #sentence} 
         * represents the whole analysed text).
         */
        private final int offset;
        /**
         * The language of the analyzed text
         */
        protected String language;
         
        private AnalysedText(String sentence, String language){
            this(sentence,language,0);
        }
        private AnalysedText(String sentence,String language, int offset){
            if(sentence == null || sentence.isEmpty()){
                throw new IllegalArgumentException(
                    "The parsed Sentence MUST NOT be NULL nor empty!");
            }
            this.sentence = sentence;
            if(language == null || language.isEmpty()){
                throw new IllegalArgumentException("The parsed language MUST NOT be NULL nor empty");
            }
            this.language = language;
            if(offset < 0){
                throw new IllegalArgumentException(
                    "The parsed offset MUST NOT be a negative number (offset="+offset+")");
            }
            this.offset = offset;
            Span[] tokenSpans = getTokenizer().tokenizePos(sentence);
            POSTaggerME tagger = getPosTagger();
            ChunkerME chunker = getChunker();
            PosTypeChunker posTypeChunker = getPosTypeChunker();
            String[] tokens = new String[tokenSpans.length];
            for(int ti = 0; ti 0 && p.equals(actPos[0]);
                        if(!done){
                            actPos[j] = p;
                            actProp[j] = posSequences[j].getProbs()[i];
                            j++;
                        }
                    }
                    posTags[i] = new String[j];
                    System.arraycopy(actPos, 0, posTags[i], 0, j);
                    posProbs[i] = new double[j];
                    System.arraycopy(actProp, 0, posProbs[i], 0, j);
                }
                //posProbs = tagger.probs();
                if(chunker != null){
                    //we still need the Array of the best ranked POS tags for the chunker
                    String[] pos = posSequences[0].getOutcomes().toArray(new String[tokens.length]);
                    chunkSpans = chunker.chunkAsSpans(tokens, pos);
                    chunkProps = chunker.probs();
                } else if(posTypeChunker != null){
                    chunkSpans = posTypeChunker.chunkAsSpans(tokens, posTags, posProbs);
                    chunkProps = new double[chunkSpans.length];
                    Arrays.fill(chunkProps, 1.0);
                } else {
                    chunkSpans = null;
                    chunkProps = null;
                }
            } else {
                posTags = null;
                posProbs = null;
                chunkSpans = null;
                chunkProps = null;
            }
            List tokenList = new ArrayList(tokenSpans.length);
            for(int i=0;i chunkList = new ArrayList(chunkSpans.length);
                for(int i=0;i getTokens(){
            return tokens;
        }
        public List getChunks(){
            return chunks;
        }
        public String getText(){
            return sentence;
        }
        public String getLanguage(){
            return language;
        }
        
        /**
         * Getter for the Offset of this Sentence relative to the whole analysed
         * Text. 0 if there is no offset this analysed text represents
         * the whole content
         * @return the offset
         */
        public int getOffset() {
            return offset;
        }

        /**
         * @deprecated replaced by STANBOL-733 (stanbol nlp processing module
         */
        public final class Token {
            //NOTE: Members are protected to allow the JVM direct access
            protected final Span span;
            protected String token;
            protected final String[] posTags;
            protected final double[] posProbabilities;
            protected final boolean hasAlphaNumeric;

            private Token(Span span,String token,String pos,double posProbability){
                this(span,token,new String[]{pos},new double[] {posProbability});
            }
            private Token(Span span,String token,String[] posTags, double[] posProbabilities){
                this.span = span;
                if(posTags == null || posTags.length < 1){
                    this.posTags = null;
                } else {
                    this.posTags = posTags;
                }
                this.token = token;
                if(this.posTags == null){
                    this.posProbabilities = null;
                } else if(posTags.length != posProbabilities.length){
                    throw new IllegalStateException("POS Tag array and POS probability array MUST BE of the same size!");
                } else {
                    this.posProbabilities = posProbabilities;
                }
                boolean foundAlphaNumericCahr = false;
                for(int i = 0;!foundAlphaNumericCahr &&i __chunkTokens;

            private Chunk(Span chunkSpan,double probability){
                this.chunkSpan = chunkSpan;
                this.span =  new Span(tokens.get(chunkSpan.getStart()).getStart(), 
                    tokens.get(chunkSpan.getEnd()).getEnd());
                this.probability = probability;
            }
            public List getTokens(){
                if(__chunkTokens == null){
                    __chunkTokens = tokens.subList(chunkSpan.getStart(), chunkSpan.getEnd());
                }
                return __chunkTokens;
            }
            /**
             * @return the span
             */
            public int getStart() {
                return chunkSpan.getStart();
            }
            public int getEnd(){
                return chunkSpan.getEnd();
            }
            public int getSize(){
                return chunkSpan.length();
            }
            /**
             * @return the probability
             */
            public double getProbability() {
                return probability;
            }
            /**
             * The text of this chunk
             * @return
             */
            public String getText(){
                if(__text == null){
                    __text = span.getCoveredText(sentence).toString();
                }
                return __text;
            }
            @Override
            public String toString() {
                return getText();
            }
        }
    }    
}