// de.dnb.oai.harvester.util.HarvesterUtils Maven / Gradle / Ivy
/**********************************************************************
* Class HarvesterUtils
*
* Copyright (c) 2005-2012, German National Library / Deutsche Nationalbibliothek
* Adickesallee 1, D-60322 Frankfurt am Main, Federal Republic of Germany
*
* This program is free software.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Kadir Karaca Kocer -- German National Library
*
**********************************************************************/
package de.dnb.oai.harvester.util;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.StreamTokenizer;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import de.dnb.oai.harvester.HarvesterHttpException;
/** ********************************************************************
* Some common utility functions.
*
* @author Kadir Karaca Kocer, German National Library
* @version 20090220
**********************************************************************/
/* ********************************************************************
* CHANGELOG:
*
* Created on 22.09.2008 15:24:39 by kocer
********************************************************************/
public class HarvesterUtils
{
/**
getFormattedChecksum() method
@param resource Content to calculate the checksum as byte array.
@return Formatted hexadecimal notation of its hash value.
Format a raw array of binary bytes as a hexadecimal string.
Based on:
"Copyright (c) 2007 by Keith Fenske. GNU Public License.";
* @throws NoSuchAlgorithmException
*/
public static String getFormattedChecksum(byte[] resource) throws NoSuchAlgorithmException {
java.security.MessageDigest sha1digest = MessageDigest.getInstance("SHA1");
byte[] raw = sha1digest.digest(resource);
int i; // index variable
final char[] hexDigits = {'0', '1', '2', '3', '4', '5', '6', '7', '8',
'9', 'a', 'b', 'c', 'd', 'e', 'f' }; // for converting binary to hexadecimal
String result; // our result (the checksum as a string)
result = ""; // start result with an empty string
for (i = 0; i < raw.length; i++) {
result += hexDigits[(raw[i] >> 4) & 0x0F]; // hex high-order nibble
result += hexDigits[raw[i] & 0x0F]; // hex low-order nibble
}
return (result); // return hexadecimal string that we created
} // end of method
/** *******************************************************************
* Checks if a string is in a list of strings.
*
* @param str String to look if its already in the list (ignoring whitespaces)
* @param list List of strings to check if it contains the defined string
* @return true
if the string is in the list, false
otherwise.
* @author Kadir Karaca Kocer, German National Library
*/
public static boolean isStringInList(String str, String[] list){
//ignore whitespaces
String tmpStr = str.trim();
for (int i = 0; i < list.length; i++){
if (tmpStr.equalsIgnoreCase(list[i].trim())){
return true;
}
}
return false;
}
/***********************************************************************
* Returns the blank separated list of words in an Array.
* Used for indexing the OCR-Text-Files
*
* @param rawOCR Byte array containing the OCR Data.
* @param encoding Type of text encoding
* @return Blank separated list of words in text.
* @throws UnsupportedEncodingException
* @throws IOException
* @author Kadir Karaca Kocer, German National Library
*/
public static String normaliseText(byte[] rawOCR, String encoding)
throws UnsupportedEncodingException, IOException {
String rawText = new String (rawOCR, encoding);
String normText = ""; //the word-only content of the ocr text.
rawText = rawText.replaceAll("\r", "");
rawText = rawText.replaceAll("-\n", "");
rawText = rawText.replaceAll("\\.\\.", " ");
rawText = rawText.replaceAll(" \\.", " ");
rawText = rawText.replaceAll("--", " ");
StreamTokenizer st = new StreamTokenizer(new java.io.StringReader(rawText));
st.parseNumbers();
st.eolIsSignificant(true);
st.lowerCaseMode(true);
for ( int tval; (tval = st.nextToken()) != StreamTokenizer.TT_EOF; ) {
if ( tval == StreamTokenizer.TT_WORD ){
if (st.sval.length() > 2) normText = normText + " " + st.sval;
}
}
return normText;
}
/***********************************************************************
* Returns the length of a Stream.
*
* @param in InputStream to count the length.
* @return Length of the stream.
* @throws IOException
* @author Kadir Karaca Kocer, German National Library
*/
public static int countStreamLength(java.io.InputStream in) throws IOException {
int length = 0;
while (in.read() != -1) {
length++;
}
return length;
}
/***********************************************************************
* Converts a Stream to a ByteArray.
*
* @param in InputStream to convert.
* @return ByteArray containing the contents of the stream.
* @throws IOException
* @author Kadir Karaca Kocer, German National Library
*/
public static byte[] convertStreamToByteArray(java.io.InputStream in) throws IOException {
byte[] content = new byte[countStreamLength(in)];
int i = 0;
int bytes = 0;
while ((bytes = in.read()) != -1) {
Integer num = new Integer(bytes);
content[i] = num.byteValue();
i++;
}
return content;
}
/***********************************************************************
* Returns the content of a text file as a String.
*
* @version 20060225
* @param file File to read
* @param encoding Encoding of the text. Default is "UTF-8" if called with null.
* @throws java.io.IOException
* @return Contents of the file as a String
* @author Kadir Karaca Kocer, German National Library
* @since 20060225
**********************************************************************/
public static String readFile(java.io.File file, String encoding)throws java.io.IOException {
//Default is "UTF-8" if called with null or empty string
String enc;
if (encoding == null || encoding.equals("")) {
enc = "UTF-8";
} else {
enc = encoding;
}
String text = "";
java.io.FileInputStream in = null;
try {
in = new java.io.FileInputStream(file);
int fl = (int) file.length();
byte buffer[] = new byte[fl];
int len = in.read(buffer, 0, fl);
//Read all the text file with the given encoding
text = new String(buffer, 0, len, enc);
}
finally {
if ( in != null ) in.close();
}
return text;
}
/**********************************************************************
* Download the contents of a given URL.
*
* @param url URL of the document to download.
* @return Whole document as a byte array
* @throws IOException
* @throws HttpException
* @throws HarvesterHttpException
* @author Kadir Karaca Kocer, German National Library
**********************************************************************/
public static byte[] downloadFile(String url) throws HttpException, IOException, HarvesterHttpException {
byte[] responseBody = null;
if ((url != null) && !(url.equals(""))) {
// use Apache library to get the file
// code from http://hc.apache.org/httpclient-3.x/tutorial.html
// Create an instance of HttpClient.
HttpClient client = new HttpClient();
// Create a method instance.
GetMethod method = new GetMethod(url);
// Provide custom retry handler is necessary
// method.getParams().setParameter(HttpMethodParams.RETRY_HANDLER, new DefaultHttpMethodRetryHandler(3, false));
try {
// Execute the method.
int statusCode = client.executeMethod(method);
if (statusCode != HttpStatus.SC_OK) {
throw new HarvesterHttpException("Error connecting URL: " + url +
"\nStatus code= " + statusCode +
"\nMessage=" + method.getStatusLine());
}
// Read the response body.
responseBody = method.getResponseBody();
}
finally {
// Release the connection.
method.releaseConnection();
}
}
return responseBody;
}
/**********************************************************************
* Downloads the contents of a given URL using given encoding.
*
* @param url URL of the document to download.
* @param encoding Encoding of the document (Java String default)
* @return Whole document as a string
* @throws IOException
* @throws HttpException
* @throws UnsupportedEncodingException
* @throws HarvesterHttpException
* @author Kadir Karaca Kocer, German National Library
**********************************************************************/
public static String downloadFile(String url, String encoding)
throws UnsupportedEncodingException, HttpException, IOException, HarvesterHttpException{
String result = "";
// if an encoding is already defined
if ((encoding != null) && !(encoding.equals(""))) {
// create the string with the given encoding
result = new String(downloadFile(url), encoding);
} else {
// if not use Java default
result = new String(downloadFile(url));
}
return result;
}
/**********************************************************************
* Adds the given text at the end of given file.
*
* @param f File to add the text
* @param text Text to add.
* @throws IOException
* @author Kadir Karaca Kocer, German National Library
**********************************************************************/
public static void appendToTextFile(java.io.File f, String text) throws IOException{
java.io.FileWriter fw = new java.io.FileWriter (f, true);
fw.append(text);
fw.close();
}
/** *******************************************************************
* Returns a array of strings which contains the given substring.
*
* @param completeList The original list of strings to search in.
* @param content The substring to search in the list of strings.
* @return A list of strings that contains the given substring.
* @author Kadir Karaca Kocer, German National Library
**********************************************************************/
public static String[] selectEntries(String[] completeList, String content) {
//define a new ArrayList becouse we don't know how many entries we will find
java.util.ArrayList selectedEntries = new java.util.ArrayList();
// make a loop over all the strings contained in the given list
for (int i=0; i < completeList.length - 1; i++){
// check if this string contains the given content string
if (completeList[i].contains(content)){
// yes! add it to the list of selected string
selectedEntries.add(completeList[i]);
}
}
return selectedEntries.toArray(new String[selectedEntries.size()]);
}
/** ******************************************************************
* Extracts the links from given HTML document.
*
* @param htmlDocument String representation of the document to parse.
* @return All the links in this document as a string array
* @author Kadir Karaca Kocer, German National Library
**********************************************************************/
public static String[] extractLinksFromHTML(String htmlDocument) {
// define an array list cause we don't know how many links exists
java.util.ArrayList linkList = new java.util.ArrayList();
String tmpStr = "";
// split the document to pieces starting with HTML construct
tmpStr = links[i + 1].split("\">")[0];
if ((tmpStr != null) && !(tmpStr.equals(""))) {
// add to the list
linkList.add(tmpStr.trim());
}
}
return linkList.toArray(new String[linkList.size()]);
}
} //End of Class
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy