All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.isotrol.impe3.idx.oc.extractors.ExtractorUtil Maven / Gradle / Ivy

The newest version!
/**
 * This file is part of Port@l
 * Port@l 3.0 - Portal Engine and Management System
 * Copyright (C) 2010  Isotrol, SA.  http://www.isotrol.com
 *
 * Port@l is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Port@l is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Port@l.  If not, see <http://www.gnu.org/licenses/>.
 */
package com.isotrol.impe3.idx.oc.extractors;


/**
 * Utility class holding static text clean-up helpers.
 * @author Juan Manuel Valverde Ramírez
 */
public class ExtractorUtil {

	private ExtractorUtil() {
		// static utility class; not meant to be instantiated
	}

	/**
	 * Removes "unwanted" control chars from the given content.
	 * <p>
	 * Punctuation, currency symbols, letters, decimal digits and space separators are kept as they are. A Unicode
	 * line separator is always turned into a single {@code '\n'}. Every other character (control chars, including
	 * {@code '\n'} and {@code '\t'}, math and other symbols, marks, format chars, unpaired surrogates, ...) is
	 * unwanted: a run of such characters is collapsed into one {@code '\n'}, and nothing is emitted for it when the
	 * previous output was already a line break produced by this method.
	 * <p>
	 * The content is scanned by Unicode code point, so characters outside the Basic Multilingual Plane are
	 * classified by their real category instead of being rejected as two surrogate halves.
	 *
	 * @param content the content to remove the unwanted control chars from
	 *
	 * @return the content with the unwanted control chars removed; an empty String when the content is
	 *         {@code null} or contains nothing but whitespace
	 */
	protected static String removeControlChars(String content) {
		if (content == null || content.trim().length() == 0) {
			// to avoid later null pointer exceptions an empty String is returned
			return "";
		}

		final int length = content.length();
		StringBuilder result = new StringBuilder(length);
		// true while the last thing written was a '\n' standing in for removed characters
		boolean wasUnwanted = false;

		for (int i = 0; i < length;) {
			// read a whole code point so that surrogate pairs are judged as one character
			int codePoint = content.codePointAt(i);
			i += Character.charCount(codePoint);

			switch (Character.getType(codePoint)) {
				// punctuation
				case Character.CURRENCY_SYMBOL:
				case Character.CONNECTOR_PUNCTUATION:
				case Character.FINAL_QUOTE_PUNCTUATION:
				case Character.INITIAL_QUOTE_PUNCTUATION:
				case Character.DASH_PUNCTUATION:
				case Character.START_PUNCTUATION:
				case Character.END_PUNCTUATION:
				case Character.OTHER_PUNCTUATION:
				// letters
				case Character.OTHER_LETTER:
				case Character.MODIFIER_LETTER:
				case Character.UPPERCASE_LETTER:
				case Character.TITLECASE_LETTER:
				case Character.LOWERCASE_LETTER:
				// digits
				case Character.DECIMAL_DIGIT_NUMBER:
				// spaces
				case Character.SPACE_SEPARATOR:
					result.appendCodePoint(codePoint);
					wasUnwanted = false;
					break;
				// line separators: always emitted, even right after another line break
				case Character.LINE_SEPARATOR:
					result.append('\n');
					wasUnwanted = true;
					break;
				// symbols
				case Character.MATH_SYMBOL:
				case Character.OTHER_SYMBOL:
				// other stuff:
				case Character.CONTROL:
				case Character.COMBINING_SPACING_MARK:
				case Character.ENCLOSING_MARK:
				case Character.FORMAT:
				case Character.LETTER_NUMBER:
				case Character.MODIFIER_SYMBOL:
				case Character.NON_SPACING_MARK:
				case Character.PARAGRAPH_SEPARATOR:
				case Character.PRIVATE_USE:
				case Character.SURROGATE:
				case Character.UNASSIGNED:
				case Character.OTHER_NUMBER:
				default:
					// collapse a run of unwanted characters into a single line break
					if (!wasUnwanted) {
						result.append('\n');
						wasUnwanted = true;
					}
			}
		}
		return result.toString();
	}
}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy