// de.dnb.oai.harvester.util.HarvesterUtils (Maven / Gradle / Ivy)
//
// Go to download
/**********************************************************************
 * Class HarvesterUtils
 *  
 * Copyright (c) 2005-2012, German National Library / Deutsche Nationalbibliothek
 * Adickesallee 1, D-60322 Frankfurt am Main, Federal Republic of Germany 
 *
 * This program is free software.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
 * Kadir Karaca Kocer -- German National Library
 * 
 **********************************************************************/

package de.dnb.oai.harvester.util;

import java.io.IOException;
import java.io.StreamTokenizer;
import java.io.UnsupportedEncodingException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;

import de.dnb.oai.harvester.HarvesterHttpException;

/** ********************************************************************
 * Some common utility functions.
 * 
 * @author Kadir Karaca Kocer, German National Library
 * @version 20090220
 **********************************************************************/

/* ********************************************************************
 * CHANGELOG:
 *   
 * Created on 22.09.2008 15:24:39 by kocer
 ********************************************************************/
public class HarvesterUtils
{
	/**
	 * Calculates the SHA-1 checksum of the given content and formats the raw
	 * digest bytes as a lower-case hexadecimal string (two characters per byte).
	 *
	 * Hex formatting based on:
	 * "Copyright (c) 2007 by Keith Fenske. GNU Public License."
	 *
	 * @param resource Content to calculate the checksum of, as byte array.
	 * @return Formatted hexadecimal notation of its hash value.
	 * @throws NoSuchAlgorithmException if no SHA-1 implementation is available.
	 */
	public static String getFormattedChecksum(byte[] resource) throws NoSuchAlgorithmException {
		// "SHA-1" is the name listed in the Java standard algorithm names.
		MessageDigest sha1digest = MessageDigest.getInstance("SHA-1");
		byte[] raw = sha1digest.digest(resource);

		// lookup table for converting a 4 bit value to its hexadecimal digit
		final char[] hexDigits = {'0', '1', '2', '3', '4', '5', '6', '7', '8',
				'9', 'a', 'b', 'c', 'd', 'e', 'f' };

		// Every byte yields exactly two characters, so the final size is known;
		// a builder avoids copying the whole string again for each nibble.
		StringBuilder result = new StringBuilder(raw.length * 2);
		for (int i = 0; i < raw.length; i++) {
			result.append(hexDigits[(raw[i] >> 4) & 0x0F]); // hex high-order nibble
			result.append(hexDigits[raw[i] & 0x0F]); // hex low-order nibble
		}
		return result.toString();
	} // end of method

	/** *******************************************************************
	 * Checks if a string is in a list of strings.
	 * The comparison ignores upper/lower case as well as leading and trailing
	 * whitespace, both of the searched string and of the list entries.
	 * 
	 * @param str String to look for. A null value is never found.
	 * @param list List of strings to search in. May be null; null entries are skipped.
	 * @return true if the string is in the list, false otherwise.
	 * @author Kadir Karaca Kocer, German National Library
	 */
	public static boolean isStringInList(String str, String[] list){
		// nothing to search for, or nothing to search in
		if (str == null || list == null) {
			return false;
		}
		//ignore whitespaces
		String wanted = str.trim();
		for (String entry : list) {
			if (entry != null && wanted.equalsIgnoreCase(entry.trim())) {
				return true;
			}
		}
		return false;
	}
	
	/***********************************************************************
	 * Returns the blank separated list of words in a text.
	 * Used for indexing the OCR-Text-Files.
	 * 
	 * The bytes are decoded with the given encoding, words hyphenated at a
	 * line end are joined again, and the text is tokenised. Only word tokens
	 * longer than two characters are kept, converted to lower case. Every kept
	 * word is preceded by one blank, so a non-empty result starts with a blank.
	 * 
	 * @param rawOCR Byte array containing the OCR Data.
	 * @param encoding Name of the text encoding of rawOCR.
	 * @return Blank separated list of words in text.
	 * @throws UnsupportedEncodingException if the named encoding is not supported.
	 * @throws IOException if the tokenizer fails to read the text.
	 * @author Kadir Karaca Kocer, German National Library
	 */
	public static String normaliseText(byte[] rawOCR, String encoding)
				throws UnsupportedEncodingException, IOException  {
		String rawText = new String(rawOCR, encoding);
		// All patterns are literal text, so replace() is used rather than regex replaceAll().
		rawText = rawText.replace("\r", "");
		rawText = rawText.replace("-\n", ""); // re-join words hyphenated at a line end
		rawText = rawText.replace("..", " ");
		rawText = rawText.replace(" .", " ");
		rawText = rawText.replace("--", " ");

		StreamTokenizer st = new StreamTokenizer(new java.io.StringReader(rawText));
		st.parseNumbers();
		st.eolIsSignificant(true);
		st.lowerCaseMode(true);
		// A new StreamTokenizer treats '/' as the start of a comment that runs to the
		// end of the line, and '"' and '\'' as string quotes. In plain OCR text that
		// would silently drop words, so these characters are made ordinary separators.
		st.ordinaryChar('/');
		st.ordinaryChar('"');
		st.ordinaryChar('\'');

		// the word-only content of the ocr text; a builder keeps this linear in text size
		StringBuilder normText = new StringBuilder();
		int tokenType;
		while ((tokenType = st.nextToken()) != StreamTokenizer.TT_EOF) {
			// numbers, line ends and single separator characters are not indexed
			if (tokenType == StreamTokenizer.TT_WORD && st.sval.length() > 2) {
				normText.append(' ').append(st.sval);
			}
		}

		return normText.toString();
	}
	
	/***********************************************************************
	 * Returns the length of a Stream.
	 * 
	 * The stream is read up to its end in order to count it. It is neither
	 * reset nor closed here, so after this call the stream has no data left
	 * to read.
	 * 
	 * @param in InputStream to count the length.
	 * @return Length of the stream in bytes.
	 * @throws IOException if reading fails, or if the stream holds more than
	 *         Integer.MAX_VALUE bytes and the length does not fit the return type.
	 * @author Kadir Karaca Kocer, German National Library
	 */
	public static int countStreamLength(java.io.InputStream in) throws IOException {
		// read in chunks; single byte reads are very slow on unbuffered streams
		byte[] buffer = new byte[8192];
		long length = 0; // counted as long so an oversized stream is detected, not wrapped
		int read;
		while ((read = in.read(buffer)) != -1) {
			length += read;
			if (length > Integer.MAX_VALUE) {
				throw new IOException("Stream is longer than " + Integer.MAX_VALUE + " bytes");
			}
		}

		return (int) length;
	}

	/***********************************************************************
	 * Converts a Stream to a ByteArray.
	 * 
	 * The stream is read in a single pass up to its end and is not closed.
	 * Its length is deliberately not determined in a separate pass first:
	 * counting would consume the stream and leave nothing to copy.
	 * 
	 * @param in InputStream to convert.
	 * @return ByteArray containing the contents of the stream.
	 * @throws IOException if reading from the stream fails.
	 * @author Kadir Karaca Kocer, German National Library
	 */
	public static byte[] convertStreamToByteArray(java.io.InputStream in) throws IOException {
		// grows as needed, so the stream length need not be known in advance
		java.io.ByteArrayOutputStream content = new java.io.ByteArrayOutputStream();
		byte[] buffer = new byte[8192];
		int read;

		while ((read = in.read(buffer)) != -1) {
			// only the first 'read' bytes of the buffer are valid in this round
			content.write(buffer, 0, read);
		}

		return content.toByteArray();
	}
	
	/***********************************************************************
	 * Returns the content of a text file as a String.
	 * 
	 * The file is read completely, decoded with the given encoding, and
	 * closed again before the method returns.
	 * 
	 * @version 20060225
	 * @param file File to read
	 * @param encoding Encoding of the text. Default is "UTF-8" if called with
	 *        null or an empty string.
	 * @throws java.io.IOException if the file cannot be opened or read, or if
	 *         the encoding is not supported.
	 * @return Contents of the file as a String
	 * @author Kadir Karaca Kocer, German National Library
	 * @since 20060225
	 **********************************************************************/
	public static String readFile(java.io.File file, String encoding)throws java.io.IOException {
		//Default is "UTF-8" if called with null or empty string
		String enc = (encoding == null || encoding.equals("")) ? "UTF-8" : encoding;

		java.io.FileInputStream in = null;
		try {
			in = new java.io.FileInputStream(file);
			java.io.ByteArrayOutputStream content = new java.io.ByteArrayOutputStream();
			byte[] buffer = new byte[8192];
			int len;
			// One read() call may deliver fewer bytes than requested, so keep
			// reading until the end of the file is reported.
			while ((len = in.read(buffer)) != -1) {
				content.write(buffer, 0, len);
			}
			//Decode all the text with the given encoding
			return content.toString(enc);
		}
		finally {
			if (in != null) {
				in.close();
			}
		}
	}
	
	/**********************************************************************
	 * Download the contents of a given URL.
	 * 
	 * Issues a single HTTP GET request through Apache Commons HttpClient and
	 * releases the connection afterwards, whether the request succeeded or not.
	 * 
	 * @param url URL of the document to download.
	 * @return Whole document as a byte array, or null if url is null or empty.
	 * @throws IOException 
	 * @throws HttpException 
	 * @throws HarvesterHttpException if the server answers with a status code
	 *         other than HttpStatus.SC_OK.
	 * @author Kadir Karaca Kocer, German National Library
	 **********************************************************************/
	public static byte[] downloadFile(String url) throws HttpException, IOException, HarvesterHttpException {
		// without a URL there is nothing to fetch
		if ((url == null) || url.equals("")) {
			return null;
		}

		// Plain HttpClient 3.x usage, following
		// http://hc.apache.org/httpclient-3.x/tutorial.html
		// (a custom retry handler could be set on the method parameters if necessary)
		HttpClient httpClient = new HttpClient();
		GetMethod get = new GetMethod(url);
		try {
			int status = httpClient.executeMethod(get);

			if (status != HttpStatus.SC_OK) {
				throw new HarvesterHttpException("Error connecting URL: " + url +
				                                   "\nStatus code= " + status +
				                                   "\nMessage=" + get.getStatusLine());
			}
			// hand back the complete response body
			return get.getResponseBody();
		}
		finally {
			// always give the connection back
			get.releaseConnection();
		}
	}
	
	/**********************************************************************
	 * Downloads the contents of a given URL using given encoding.
	 * 
	 * @param url URL of the document to download.
	 * @param encoding Encoding of the document. If null or empty, the bytes
	 *        are decoded with the Java String default.
	 * @return Whole document as a string. An empty string is returned when
	 *         the download delivered no content, for example because url is
	 *         null or empty.
	 * @throws IOException 
	 * @throws HttpException 
	 * @throws UnsupportedEncodingException if the given encoding is not supported.
	 * @throws HarvesterHttpException
	 * @author Kadir Karaca Kocer, German National Library
	 **********************************************************************/
	public static String downloadFile(String url, String encoding)
	              throws UnsupportedEncodingException, HttpException, IOException, HarvesterHttpException{
		byte[] content = downloadFile(url);
		if (content == null) {
			// downloadFile(url) returns null for a null or empty URL; building
			// a String from null would fail with a NullPointerException.
			return "";
		}
		// if an encoding is already defined
		if ((encoding != null) && !(encoding.equals(""))) {
			// create the string with the given encoding
			return new String(content, encoding);
		}
		// if not use Java default
		return new String(content);
	}
	
	/**********************************************************************
	 * Adds the given text at the end of given file.
	 * 
	 * The file is opened in append mode through a java.io.FileWriter and is
	 * closed again before the method returns, also when writing fails.
	 * 
	 * @param f File to add the text
	 * @param text Text to add. 
	 * @throws IOException if the file cannot be opened or written.
	 * @author Kadir Karaca Kocer, German National Library
	 **********************************************************************/
	public static void appendToTextFile(java.io.File f, String text) throws IOException{
		java.io.FileWriter fw = new java.io.FileWriter(f, true);
		try {
			fw.append(text);
		}
		finally {
			// close in any case so a failed write does not leak the file handle
			fw.close();
		}
	}
	
	/** *******************************************************************
	 * Returns an array of the strings which contain the given substring.
	 * The order of the entries is kept. The check is case sensitive.
	 * 
	 * @param completeList The original list of strings to search in.
	 *        May be null; null entries are skipped.
	 * @param content The substring to search in the list of strings.
	 *        If null, no entry matches.
	 * @return A list of strings that contains the given substring;
	 *         empty (never null) if nothing matches.
	 * @author Kadir Karaca Kocer, German National Library
	 **********************************************************************/
	public static String[] selectEntries(String[] completeList, String content) {
		//define a new ArrayList because we don't know how many entries we will find
		java.util.List<String> selectedEntries = new java.util.ArrayList<String>();
		if (completeList != null && content != null) {
			// loop over ALL the strings in the given list, including the last one
			for (String entry : completeList) {
				// check if this string contains the given content string
				if (entry != null && entry.contains(content)) {
					// yes! add it to the list of selected strings
					selectedEntries.add(entry);
				}
			}
		}
		return selectedEntries.toArray(new String[selectedEntries.size()]);
	}
	
	/** ******************************************************************
	 * Extracts the links from given HTML document.
	 * 
	 * @param htmlDocument String representation of the document to parse.
	 * @return All the links in this document as a string array
	 * @author Kadir Karaca Kocer, German National Library
	 **********************************************************************/
	public static String[] extractLinksFromHTML(String htmlDocument) {
		// define an array list cause we don't know how many links exists
		java.util.ArrayList linkList = new java.util.ArrayList();
		String tmpStr = "";
		// split the document to pieces starting with HTML construct 
			tmpStr = links[i + 1].split("\">")[0];
			if ((tmpStr != null) && !(tmpStr.equals(""))) {
				// add to the list
				linkList.add(tmpStr.trim());
			}
		}
		return linkList.toArray(new String[linkList.size()]);
	}
} //End of Class