org.odftoolkit.simple.common.TextExtractor Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of simple-odf Show documentation
A simple API for easy manipulation of ODF documents.
The newest version!
/* 
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements.  See the NOTICE file
distributed with this work for additional information
regarding copyright ownership.  The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License.  You may obtain a copy of the License at

 http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied.  See the License for the
specific language governing permissions and limitations
under the License.
*/

package org.odftoolkit.simple.common;

import org.odftoolkit.odfdom.dom.DefaultElementVisitor;
import org.odftoolkit.odfdom.dom.OdfDocumentNamespace;
import org.odftoolkit.odfdom.dom.element.text.TextHElement;
import org.odftoolkit.odfdom.dom.element.text.TextLineBreakElement;
import org.odftoolkit.odfdom.dom.element.text.TextPElement;
import org.odftoolkit.odfdom.dom.element.text.TextSElement;
import org.odftoolkit.odfdom.dom.element.text.TextTabElement;
import org.odftoolkit.odfdom.pkg.OdfElement;
import org.odftoolkit.odfdom.pkg.OdfNamespace;
import org.w3c.dom.Node;

/**
 * This is a sub class of {@code DefaultElementVisitor}, which is used to
 * extract the display text from an ODF element. For example, if you want to
 * get all of the text content of a slide's notes, you can call
 * {@code getOdfElement()} to get the ODF element of the notes, then pass it to
 * {@code newOdfTextExtractor} to create a {@code TextExtractor}. The last step
 * is very easy: you only need to call {@code getText()}, and all of the text
 * content will be returned as a string. Another, easier way is to pass the ODF
 * element to the static method {@code TextExtractor.getText(OdfElement)}
 * directly.
 * <p>
 * If you pass the content root, which you can get by
 * {@link org.odftoolkit.simple.Document#getContentRoot()
 * Document.getContentRoot()}, as the parameter, the whole document content
 * will be returned, without any tag information.
 * <p>
 * This extractor implements part of the ODF elements' white space handling.
 * The {@code visit()} methods of text:p, text:h, text:s, text:tab and
 * text:line-break are overridden to process white space according to the
 * ODF specification.
 * 
 * @see org.odftoolkit.odfdom.pkg.OdfElement
 */
public class TextExtractor extends DefaultElementVisitor {

	/** Character appended after paragraphs, headings and line breaks. */
	protected static final char NewLineChar = '\n';
	/** Character appended for each text:tab element. */
	protected static final char TabChar = '\t';
	/** Buffer that collects the extracted text while elements are visited. */
	protected final ExtractorStringBuilder mTextBuilder;
	/** Root element whose text is extracted; stays null with the default constructor. */
	OdfElement mElement;
	
	/**
	 * This class is used to provide the string builder functions to extractor.
	 * It will automatically process the last NewLineChar: a new line that was
	 * added with {@link #appendLine()} and is still the very last character is
	 * left out of the value returned by {@link #toString()}.
	 * 
	 * @since 0.3.5
	 */
	protected static class ExtractorStringBuilder {
		private final StringBuilder mBuilder;
		// Length of mBuilder right after the most recent appendLine(), or -1
		// when appendLine() has not been called yet. The buffer only grows, so
		// the current length equals this value exactly when nothing has been
		// appended after that new line.
		private int mLengthAfterLastLine;

		ExtractorStringBuilder() {
			mBuilder = new StringBuilder();
			mLengthAfterLastLine = -1;
		}

		/**
		 * Append a string
		 * 
		 * @param str
		 *            - the string
		 */
		public void append(String str) {
			mBuilder.append(str);
		}

		/**
		 * Append a character
		 * 
		 * @param ch
		 *            - the character
		 */
		public void append(char ch) {
			mBuilder.append(ch);
		}

		/**
		 * Append a new line character at the end
		 */
		public void appendLine() {
			mBuilder.append(NewLineChar);
			mLengthAfterLastLine = mBuilder.length();
		}

		/**
		 * Return the string value.
		 * 
		 * If the last character is a new line character and is appended with
		 * appendLine(), the last new line character is left out of the
		 * returned string. The underlying buffer is not modified, so calling
		 * this method repeatedly returns the same value, and characters
		 * appended after the last appendLine() are never dropped.
		 * 
		 * @return the collected text without the trailing appendLine() new
		 *         line
		 */
		@Override
		public String toString() {
			int length = mBuilder.length();
			if (length == mLengthAfterLastLine) {
				// Nothing was appended after the last appendLine(): hide
				// that trailing new line without touching the buffer.
				return mBuilder.substring(0, length - 1);
			}
			return mBuilder.toString();
		}
	}

	/**
	 * Return the text content of a element as String
	 * 
	 * @param ele
	 *            the ODF element
	 * @return the text content of the element
	 */
	public static synchronized String getText(OdfElement ele) {
		TextExtractor extractor = newOdfTextExtractor(ele);
		return extractor.getText();
	}

	/**
	 * Create a TextExtractor instance using specified ODF element, which text
	 * content can be extracted by getText().
	 * 
	 * @param element
	 *            the ODF element whose text will be extracted.
	 * @return an instance of TextExtractor
	 */
	public static TextExtractor newOdfTextExtractor(OdfElement element) {
		return new TextExtractor(element);
	}

	/**
	 * Return the text content of specified ODF element as a string.
	 * 
	 * Every call visits the element again and appends to the same internal
	 * buffer, so the method is meant to be called once per extractor
	 * instance.
	 * 
	 * @return the text content as a string
	 */
	public String getText() {
		visit(mElement);
		return mTextBuilder.toString();
	}

	/**
	 * Default constructor. It creates the text buffer only and leaves the
	 * element to extract unset.
	 */
	protected TextExtractor() {
		mTextBuilder = new ExtractorStringBuilder();
	}

	/**
	 * Constructor with an ODF element as parameter
	 * 
	 * @param element
	 *            the ODF element whose text would be extracted.
	 */
	protected TextExtractor(OdfElement element) {
		mTextBuilder = new ExtractorStringBuilder();
		mElement = element;
	}

	/**
	 * The end users needn't to care of this method, if you don't want to
	 * override the text content handling strategy of OdfElement.
	 * 
	 * Appends the text of the element and its children. Elements in the meta
	 * or dc namespace are followed by a new line.
	 * 
	 * @see org.odftoolkit.odfdom.dom.DefaultElementVisitor#visit(org.odftoolkit.odfdom.pkg.OdfElement)
	 */
	@Override
	public void visit(OdfElement element) {
		appendElementText(element);
		String namespaceUri = element.getNamespaceURI();
		if (OdfDocumentNamespace.META.getUri().equals(namespaceUri)
				|| OdfDocumentNamespace.DC.getUri().equals(namespaceUri)) {
			mTextBuilder.appendLine();
		}
	}

	/**
	 * The end users needn't to care of this method, if you don't want to
	 * override the text content handling strategy of text:p.
	 * 
	 * Appends the paragraph text followed by a new line.
	 * 
	 * @see org.odftoolkit.odfdom.dom.DefaultElementVisitor#visit(org.odftoolkit.odfdom.dom.element.text.TextPElement)
	 */
	@Override
	public void visit(TextPElement ele) {
		appendElementText(ele);
		mTextBuilder.appendLine();
	}

	/**
	 * The end users needn't to care of this method, if you don't want to
	 * override the text content handling strategy of text:h.
	 * 
	 * Appends the heading text followed by a new line.
	 * 
	 * @see org.odftoolkit.odfdom.dom.DefaultElementVisitor#visit(org.odftoolkit.odfdom.dom.element.text.TextHElement)
	 */
	@Override
	public void visit(TextHElement ele) {
		appendElementText(ele);
		mTextBuilder.appendLine();
	}

	/**
	 * The end users needn't to care of this method, if you don't want to
	 * override the text content handling strategy of text:s.
	 * 
	 * Appends as many space characters as the count attribute of the element
	 * states; a missing attribute counts as one space.
	 * 
	 * @see org.odftoolkit.odfdom.dom.DefaultElementVisitor#visit(org.odftoolkit.odfdom.dom.element.text.TextSElement)
	 */
	@Override
	public void visit(TextSElement ele) {
		Integer count = ele.getTextCAttribute();
		if (count == null) {
			count = 1;
		}
		for (int i = 0; i < count; i++) {
			mTextBuilder.append(' ');
		}
	}

	/**
	 * The end users needn't to care of this method, if you don't want to
	 * override the text content handling strategy of text:tab.
	 * 
	 * Appends a single tab character.
	 * 
	 * @see org.odftoolkit.odfdom.dom.DefaultElementVisitor#visit(org.odftoolkit.odfdom.dom.element.text.TextTabElement)
	 */
	@Override
	public void visit(TextTabElement ele) {
		mTextBuilder.append(TabChar);
	}

	/**
	 * The end users needn't to care of this method, if you don't want to
	 * override the text content handling strategy of text:line-break.
	 * 
	 * Appends a new line character as regular content, so it is kept even
	 * when it is the last character of the extracted text.
	 * 
	 * @see org.odftoolkit.odfdom.dom.DefaultElementVisitor#visit(org.odftoolkit.odfdom.dom.element.text.TextLineBreakElement)
	 */
	@Override
	public void visit(TextLineBreakElement ele) {
		mTextBuilder.append(NewLineChar);
	}

	/**
	 * Append the text content of this element to string buffer.
	 * 
	 * Text nodes are appended as they are; child elements are asked to accept
	 * this visitor, which dispatches to the matching visit() method. Other
	 * node types are skipped.
	 * 
	 * @param ele
	 *            the ODF element whose text will be appended.
	 */
	protected void appendElementText(OdfElement ele) {
		Node node = ele.getFirstChild();
		while (node != null) {
			if (node.getNodeType() == Node.TEXT_NODE) {
				mTextBuilder.append(node.getNodeValue());
			} else if (node.getNodeType() == Node.ELEMENT_NODE) {
				OdfElement element = (OdfElement) node;
				element.accept(this);
			}
			node = node.getNextSibling();
		}
	}
}