All Downloads are FREE. Search and download functionalities are using the official Maven repository.

gate.email.EmailDocumentHandler Maven / Gradle / Ivy

/*
 *  EmailDocumentHandler.java
 *
 *  Copyright (c) 1995-2012, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Cristian URSU,  3/Aug/2000
 *
 *  $Id: EmailDocumentHandler.java 17854 2014-04-17 13:44:42Z markagreenwood $
 */

package gate.email;

import gate.Factory;
import gate.FeatureMap;
import gate.GateConstants;
import gate.event.StatusListener;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;

/**
  * This class implements the behaviour of the Email reader
  * It takes the Gate Document representing a list with e-mails and
  * creates Gate annotations on it.
  */
public class EmailDocumentHandler {

  private String content = null;
  private long documentSize = 0;

  /**
    * Constructor used in tests mostly
    */
  public EmailDocumentHandler() {
    setUp();
  }//EmailDocumentHandler

  /**
    * Constructor initialises some private fields
    */
  public EmailDocumentHandler( gate.Document aGateDocument,
                               Map  aMarkupElementsMap,
                               Map  anElement2StringMap
                              ) {

    gateDocument = aGateDocument;

    // gets AnnotationSet based on the new gate document
    if (basicAS == null)
      basicAS = gateDocument.getAnnotations(
                                GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);

    markupElementsMap = aMarkupElementsMap;
    element2StringMap = anElement2StringMap;
    setUp();
  }// EmailDocumentHandler

  /**
    * Reads the Gate Document line by line and does the folowing things:
    * 
    *
  • Each line is analized in order to detect where an e-mail starts. *
  • If the line belongs to an e-mail header then creates the * annotation if the markupElementsMap allows that. *
  • Lines belonging to the e-mail body are placed under a Gate * annotation called messageBody. *
*/ public void annotateMessages() throws IOException, gate.util.InvalidOffsetException { // obtain a BufferedReader form the Gate document... BufferedReader gateDocumentReader = null; // Get the string representing the content of the document // It is used inside CreateAnnotation method content = gateDocument.getContent().toString(); // Get the sieze of the Gate Document. For the same purpose. documentSize = gateDocument.getContent().size().longValue(); // gateDocumentReader = new BufferedReader(new InputStreamReader( // gateDocument.getSourceUrl().openConnection().getInputStream())); gateDocumentReader = new BufferedReader(new StringReader(content)); // for each line read from the gateDocumentReader do // if the line begins an e-mail message then fire a status listener, mark // that we are processing an e-mail, update the cursor and go to the next // line. // if we are inside an e-mail, test if the line belongs to the message // header // if so, create a header field annotation. // if we are inside a a body and this is the first line from the body, // create the message body annotation. // Otherwise just update the cursor and go to the next line // if the line doesn't belong to an e-mail message then just update the // cursor. // next line String line = null; String aFieldName = null; long cursor = 0; long endEmail = 0; long startEmail = 0; long endHeader = 0; long startHeader = 0; long endBody = 0; long startBody = 0; long endField = 0; long startField = 0; boolean insideAnEmail = false; boolean insideHeader = false; boolean emailReadBefore = false; boolean fieldReadBefore = false; long nlSize = detectNLSize(); //Out.println("NL SIZE = " + nlSize); // read each line from the reader while ((line = gateDocumentReader.readLine()) != null){ // Here we test if the line delimitates two e-mail messages. // Each e-mail message begins with a line like this: // From P.Fairhurst Thu Apr 18 12:22:23 1996 // Method lineBeginsMessage() detects such lines. if (lineBeginsMessage(line)){ // Inform the status listener to fire only // if no. of elements processed. // So far is a multiple of ELEMENTS_RATE if ((++ emails % EMAILS_RATE) == 0) fireStatusChangedEvent("Reading emails : " + emails); // if there are e-mails read before, then the previous e-mail // ends here. if (true == emailReadBefore){ // Cursor points at the beggining of the line // E-mail and Body ends before the \n char // Email ends as cursor value indicates endEmail = cursor - nlSize ; // also the e-mail body ends when an e-mail ends endBody = cursor - nlSize; //Annotate an E-mail body (startBody, endEmail) createAnnotation("Body",startBody,endBody,null); //Annotate an E-mail message(startEmail, endEmail) Email starts createAnnotation("Message",startEmail,endEmail,null); } // if no e-mail was read before, now there is at list one message // read emailReadBefore = true; // E-mail starts imediately from the beginning of this line which // sepatates 2 messages. startEmail = cursor; // E-mail header starts also from here startHeader = cursor; // The cursor is updated with the length of the line + the // new line char cursor += line.length() + nlSize; // We are inside an e-mail insideAnEmail = true; // Next is the E-mail header insideHeader = true; // No field inside header has been read before fieldReadBefore = false; // Read the next line continue; }//if (lineBeginsMessage(line)) if (false == insideAnEmail){ // the cursor is update with the length of the line + // the new line char cursor += line.length() + nlSize; // read the next line continue; }//if // here we are inside an e-mail message (inside Header or Body) if (true == insideHeader){ // E-mail spec sais that E-mail header is separated by E-mail body // by a \n char if (line.equals("")){ // this \n sepatates the header of an e-mail form its body // If we are here it means that the header has ended. insideHeader = false; // e-mail header ends here endHeader = cursor - nlSize; // update the cursor with the length of \n cursor += line.length() + nlSize; // E-mail body starts from here startBody = cursor; // if fields were read before, it means that the e-mail has a header if (true == fieldReadBefore){ endField = endHeader; //Create a field annotation (fieldName, startField, endField) createAnnotation(aFieldName, startField, endField, null); //Create an e-mail header annotation createAnnotation("Header",startHeader,endHeader,null); }//if // read the next line continue; }//if (line.equals("")) // if line begins with a field then prepare to create an // annotation with the name of the field if (lineBeginsWithField(line)){ // if a field was read before, it means that the previous field ends // here if (true == fieldReadBefore){ // the previous field end here endField = cursor - nlSize; //Create a field annotation (fieldName, startField, endField) createAnnotation(aFieldName, startField, endField, null); }//if fieldReadBefore = true; aFieldName = getFieldName(); startField = cursor + aFieldName.length() + ":".length(); }//if // in both cases the cursor is updated and read the next line // the cursor is update with the length of the line + // the new line char cursor += line.length() + nlSize; // read the next line continue; }//if (true == insideHeader) // here we are inside the E-mail body // the body will end when the e-mail will end. // here we just update the cursor cursor += line.length() + nlSize; }//while // it might be possible that the file to contain only one e-mail and // if the file contains only one e-mail message then the variable // emailReadBefore must be set on true value if (true == emailReadBefore){ endBody = cursor - nlSize; endEmail = cursor - nlSize; //Annotate an E-mail body (startBody, endEmail) createAnnotation("Body",startBody,endBody,null); //Annotate an E-mail message(startEmail, endEmail) Email starts createAnnotation("Message",startEmail,endEmail,null); } // if emailReadBefore is not set on true, that means that we didn't // encounter any line like this: // From P.Fairhurst Thu Apr 18 12:22:23 1996 }//annotateMessages /** * This method detects if the text file which contains e-mail messages * is under MSDOS or UNIX format. * Under MSDOS the size of NL is 2 (\n \r) and under UNIX (\n) the size is 1 * @return the size of the NL (1,2 or 0 = if no \n is found) */ private int detectNLSize() { // get a char array char[] document = null; // transform the gate Document into a char array document = gateDocument.getContent().toString().toCharArray(); // search for the \n char // when it is found test if is followed by the \r char for (int i=0; i= 0) && (document[i-1] == '\r')) ) return 2; else return 1; } } //if no \n char is found then the document is contained into a single text // line. return 0; } // detectNLSize /** * This method creates a gate annotation given its name, start, end and * feature map. */ private void createAnnotation(String anAnnotationName, long anAnnotationStart, long anAnnotationEnd, FeatureMap aFeatureMap) throws gate.util.InvalidOffsetException{ /* while (Character.isWhitespace(content.charAt((int) anAnnotationStart))) anAnnotationStart ++; // System.out.println(content.charAt((int) anAnnotationEnd)); while (Character.isWhitespace(content.charAt((int) anAnnotationEnd))) anAnnotationEnd --; anAnnotationEnd ++; */ if (canCreateAnnotation(anAnnotationStart,anAnnotationEnd,documentSize)){ if (aFeatureMap == null) aFeatureMap = Factory.newFeatureMap(); basicAS.add( new Long(anAnnotationStart), new Long(anAnnotationEnd), anAnnotationName.toLowerCase(), aFeatureMap); }// End if }//createAnnotation /** * This method verifies if an Annotation can be created. */ private boolean canCreateAnnotation(long start, long end, long gateDocumentSize){ if (start < 0 || end < 0 ) return false; if (start > end ) return false; if ((start > gateDocumentSize) || (end > gateDocumentSize)) return false; return true; }// canCreateAnnotation /** * Tests if the line begins an e-mail message * @param aTextLine a line from the file containing the e-mail messages * @return true if the line begins an e-mail message * @return false if is doesn't */ protected boolean lineBeginsMessage(String aTextLine){ int score = 0; // if first token is "From" and the rest contains Day, Zone, etc // then this line begins a message // create a new String Tokenizer with " " as separator StringTokenizer tokenizer = new StringTokenizer(aTextLine," "); // get the first token String firstToken = null; if (tokenizer.hasMoreTokens()) firstToken = tokenizer.nextToken(); else return false; // trim it firstToken = firstToken.trim(); // check against "From" word // if the first token is not From then the entire line can not begin // a message. if (!firstToken.equals("From")) return false; // else continue the analize while (tokenizer.hasMoreTokens()){ // get the next token String token = tokenizer.nextToken(); token = token.trim(); // see if it has a meaning(analize if is a Day, Month,Zone, Time, Year ) if (hasAMeaning(token)) score += 1; } // a score greather or equql with 5 means that this line begins a message if (score >= 5) return true; else return false; } // lineBeginsMessage /** * Tests if the line begins with a field from the e-mail header * If the answer is true then it also sets the member fieldName with the * value of this e-mail header field. * @param aTextLine a line from the file containing the e-mail text * @return true if the line begins with a field from the e-mail header * @return false if is doesn't */ protected boolean lineBeginsWithField(String aTextLine){ if (containsSemicolon(aTextLine)){ StringTokenizer tokenizer = new StringTokenizer(aTextLine,":"); // get the first token String firstToken = null; if (tokenizer.hasMoreTokens()) firstToken = tokenizer.nextToken(); else return false; if (firstToken != null){ // trim it firstToken = firstToken.trim(); if (containsWhiteSpaces(firstToken)) return false; // set the member field fieldName = firstToken; } return true; } else return false; } // lineBeginsWithField /** * This method checks if a String contains white spaces. */ protected boolean containsWhiteSpaces(String aString) { for (int i = 0; i 0) && (number < 32)) return true; // if is a number between 1900 si 3000 then is a year ;)) if ((number > 1900) && (number < 3000)) return true; // it might be the last two digits of 19xx if ((number >= 0) && (number <= 99)) return true; } // test if is time: hh:mm:ss if (isTime(aToken)) return true; return false; } // hasAMeaning /** * Tests a token if is in time format HH:MM:SS */ protected boolean isTime(String aToken) { StringTokenizer st = new StringTokenizer(aToken,":"); // test each token if is hour, minute or second String hourString = null; if (st.hasMoreTokens()) hourString = st.nextToken(); // if there are no more tokens, it means that is not a time if (hourString == null) return false; // test if is a number between 0 and 23 Integer hourInteger = null; try{ hourInteger = new Integer(hourString); } catch (NumberFormatException e){ hourInteger = null; } if (hourInteger == null) return false; // if is not null then it means is a number // test if is in 0 - 23 range // if is not in this range then is not an hour int hour = hourInteger.intValue(); if ( (hour < 0) || (hour > 23) ) return false; // we have the hour // now repeat the test for minute and seconds // minutes String minutesString = null; if (st.hasMoreTokens()) minutesString = st.nextToken(); // if there are no more tokens (minutesString == null) then return false if (minutesString == null) return false; // test if is a number between 0 and 59 Integer minutesInteger = null; try { minutesInteger = new Integer (minutesString); } catch (NumberFormatException e){ minutesInteger = null; } if (minutesInteger == null) return false; // if is not null then it means is a number // test if is in 0 - 59 range // if is not in this range then is not a minute int minutes = minutesInteger.intValue(); if ( (minutes < 0) || (minutes > 59) ) return false; // seconds String secondsString = null; if (st.hasMoreTokens()) secondsString = st.nextToken(); // if there are no more tokens (secondsString == null) then return false if (secondsString == null) return false; // test if is a number between 0 and 59 Integer secondsInteger = null; try { secondsInteger = new Integer (secondsString); } catch (NumberFormatException e){ secondsInteger = null; } if (secondsInteger == null) return false; // if is not null then it means is a number // test if is in 0 - 59 range // if is not in this range then is not a minute int seconds = secondsInteger.intValue(); if ( (seconds < 0) || (seconds > 59) ) return false; // if there are more tokens in st it means that we don't have this format: // HH:MM:SS if (st.hasMoreTokens()) return false; // if we are here it means we have a time return true; }// isTime /** * Initialises the collections with data used by method lineBeginsMessage() */ private void setUp(){ day = new HashSet(); day.add("Mon"); day.add("Tue"); day.add("Wed"); day.add("Thu"); day.add("Fri"); day.add("Sat"); day.add("Sun"); month = new HashSet(); month.add("Jan"); month.add("Feb"); month.add("Mar"); month.add("Apr"); month.add("May"); month.add("Jun"); month.add("Jul"); month.add("Aug"); month.add("Sep"); month.add("Oct"); month.add("Nov"); month.add("Dec"); zone = new HashSet(); zone.add("UT"); zone.add("GMT"); zone.add("EST"); zone.add("EDT"); zone.add("CST"); zone.add("CDT"); zone.add("MST"); zone.add("MDT"); zone.add("PST"); zone.add("PDT"); }//setUp /** * This method returns the value of the member fieldName. * fieldName is set by the method lineBeginsWithField(String line). * Each time the the line begins with a field name, that fiels will be stored * in this member. */ private String getFieldName() { if (fieldName == null) return new String(""); else return fieldName; } // getFieldName // StatusReporter Implementation /** * This methos is called when a listener is registered with this class */ public void addStatusListener(StatusListener listener){ myStatusListeners.add(listener); } /** * This methos is called when a listener is removed */ public void removeStatusListener(StatusListener listener){ myStatusListeners.remove(listener); } /** * This methos is called whenever we need to inform the listener * about an event. */ protected void fireStatusChangedEvent(String text){ Iterator listenersIter = myStatusListeners.iterator(); while(listenersIter.hasNext()) listenersIter.next().statusChanged(text); } private static final int EMAILS_RATE = 16; // a gate document private gate.Document gateDocument = null; // an annotation set used for creating annotation reffering the doc private gate.AnnotationSet basicAS = null; // this map marks the elements that we don't want to create annotations @SuppressWarnings("unused") private Map markupElementsMap = null; // this map marks the elements after we want to insert some strings @SuppressWarnings("unused") private Map element2StringMap = null; // listeners for status report protected List myStatusListeners = new LinkedList(); // this reports the the number of emails that have beed processed so far private int emails = 0; // this is set by the method lineBeginsWithField(String line) // each time the the line begins with a field name, that fiels will be stored // in this member. private String fieldName = null; private Collection day = null; private Collection month = null; private Collection zone = null; } //EmailDocumentHandler




© 2015 - 2024 Weber Informatics LLC | Privacy Policy