All Downloads are FREE. Search and download functionalities are using the official Maven repository.

de.l3s.boilerpipe.document.TextBlock Maven / Gradle / Ivy

The newest version!
/**
 * boilerpipe
 *
 * Copyright (c) 2009 Christian Kohlschütter
 *
 * The author licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.l3s.boilerpipe.document;

import java.util.BitSet;
import java.util.HashSet;
import java.util.Set;

import de.l3s.boilerpipe.labels.DefaultLabels;

/**
 * Describes a block of text.
 * 
 * A block can be an "atomic" text element (i.e., a sequence of text that is not
 * interrupted by any HTML markup) or a compound of such atomic elements.
 * 
 * @author Christian Kohlschütter
 */
public class TextBlock implements Cloneable {
    boolean isContent = false;
    private CharSequence text;
    Set labels = null;

    int offsetBlocksStart;
    int offsetBlocksEnd;

    int numWords;
    int numWordsInAnchorText;
    int numWordsInWrappedLines;
    int numWrappedLines;
    float textDensity;
    float linkDensity;

    BitSet containedTextElements;

    private int numFullTextWords = 0;
	private int tagLevel;

    private static final BitSet EMPTY_BITSET = new BitSet();
    public static final TextBlock EMPTY_START = new TextBlock("", EMPTY_BITSET,
            0, 0, 0, 0, -1);
    public static final TextBlock EMPTY_END = new TextBlock("", EMPTY_BITSET,
            0, 0, 0, 0, Integer.MAX_VALUE);

    public TextBlock(final String text) {
        this(text, null, 0,0,0,0,0);
    }
    
    public TextBlock(final String text, final BitSet containedTextElements,
            final int numWords, final int numWordsInAnchorText,
            final int numWordsInWrappedLines, final int numWrappedLines,
            final int offsetBlocks) {
        this.text = text;
        this.containedTextElements = containedTextElements;
        this.numWords = numWords;
        this.numWordsInAnchorText = numWordsInAnchorText;
        this.numWordsInWrappedLines = numWordsInWrappedLines;
        this.numWrappedLines = numWrappedLines;
        this.offsetBlocksStart = offsetBlocks;
        this.offsetBlocksEnd = offsetBlocks;
        initDensities();
    }

    public boolean isContent() {
        return isContent;
    }

    public boolean setIsContent(boolean isContent) {
        if (isContent != this.isContent) {
            this.isContent = isContent;
            return true;
        } else {
            return false;
        }
    }

    public String getText() {
        return text.toString();
    }

    public int getNumWords() {
        return numWords;
    }

    public int getNumWordsInAnchorText() {
        return numWordsInAnchorText;
    }

    public float getTextDensity() {
        return textDensity;
    }

    public float getLinkDensity() {
        return linkDensity;
    }

    public void mergeNext(final TextBlock other) {
        if (!(text instanceof StringBuilder)) {
            text = new StringBuilder(text);
        }
        StringBuilder sb = (StringBuilder) text;
        sb.append('\n');
        sb.append(other.text);

        numWords += other.numWords;
        numWordsInAnchorText += other.numWordsInAnchorText;

        numWordsInWrappedLines += other.numWordsInWrappedLines;
        numWrappedLines += other.numWrappedLines;

        offsetBlocksStart = Math
                .min(offsetBlocksStart, other.offsetBlocksStart);
        offsetBlocksEnd = Math.max(offsetBlocksEnd, other.offsetBlocksEnd);

        initDensities();

        this.isContent |= other.isContent;

        if(containedTextElements == null) {
        	containedTextElements = (BitSet)other.containedTextElements.clone();
        } else {
        	containedTextElements.or(other.containedTextElements);
        }

        numFullTextWords += other.numFullTextWords;

        if (other.labels != null) {
            if (labels == null) {
                labels = new HashSet(other.labels);
            } else {
                labels.addAll(other.labels);
            }
        }
        
        tagLevel = Math.min(tagLevel, other.tagLevel);
    }

    private void initDensities() {
        if (numWordsInWrappedLines == 0) {
            numWordsInWrappedLines = numWords;
            numWrappedLines = 1;
        }
        textDensity = numWordsInWrappedLines / (float) numWrappedLines;
        linkDensity = numWords == 0 ? 0 : numWordsInAnchorText / (float) numWords;
    }

    public int getOffsetBlocksStart() {
        return offsetBlocksStart;
    }
    public int getOffsetBlocksEnd() {
        return offsetBlocksEnd;
    }

    public String toString() {
        return "[" + offsetBlocksStart + "-" + offsetBlocksEnd + ";tl="+tagLevel+"; nw="+numWords+";nwl="+numWrappedLines+";ld="+linkDensity+"]\t"
                + (isContent?"CONTENT":"boilerplate") + "," + labels + "\n" + getText();
    }

    /**
     * Adds an arbitrary String label to this {@link TextBlock}.
     * 
     * @param label The label
     * @see DefaultLabels
     */
    public void addLabel(final String label) {
        if (labels == null) {
            labels = new HashSet(2);
        }
        labels.add(label);
    }

    /**
     * Checks whether this TextBlock has the given label.
     * 
     * @param label The label
     * @return true if this block is marked by the given label.
     */
    public boolean hasLabel(final String label) {
        return labels != null && labels.contains(label);
    }
    
    public boolean removeLabel(final String label) {
    	return labels != null && labels.remove(label);
    }
    
    /**
     * Returns the labels associated to this TextBlock, or null if no such labels
     * exist.
     * 
     * NOTE: The returned instance is the one used directly in TextBlock. You have full access
     * to the data structure. However it is recommended to use the label-specific methods in {@link TextBlock}
     * whenever possible.
     * 
     * @return Returns the set of labels, or null if no labels was added yet.
     */
    public Set getLabels() {
        return labels;
    }
    
    /**
     * Adds a set of labels to this {@link TextBlock}.
     * null-references are silently ignored.
     * 
     * @param l The labels to be added. 
     */
    public void addLabels(final Set l) {
        if(l == null) {
            return;
        }
        if(this.labels == null) {
            this.labels = new HashSet(l);
        } else {
            this.labels.addAll(l);
        }
    }
    
    /**
     * Adds a set of labels to this {@link TextBlock}.
     * null-references are silently ignored.
     * 
     * @param l The labels to be added. 
     */
    public void addLabels(final String... l) {
        if(l == null) {
            return;
        }
        if(this.labels == null) {
            this.labels = new HashSet();
        }
        for(final String label : l) {
            this.labels.add(label);
        }
    }

    /**
     * Returns the containedTextElements BitSet, or null.
     * @return
     */
    public BitSet getContainedTextElements() {
        return containedTextElements;
    }

	@Override
	protected Object clone() {
		final TextBlock clone;
		try {
			clone = (TextBlock)super.clone();
		} catch (CloneNotSupportedException e) {
			throw new RuntimeException(e);
		}
		if(text != null && !(text instanceof String)) {
			clone.text = new StringBuilder(text);
		}
		if(labels != null && !labels.isEmpty()) {
			clone.labels = new HashSet(labels);
		}
		if(containedTextElements != null) {
			clone.containedTextElements = (BitSet)containedTextElements.clone();
		}
		
		return clone;
	}

	public int getTagLevel() {
		return tagLevel;
	}

	public void setTagLevel(int tagLevel) {
		this.tagLevel = tagLevel;
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy