All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.tika.parser.iptc.IptcAnpaParser Maven / Gradle / Ivy

There is a newer version: 1.0.18
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.iptc;

import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.Locale;
import java.util.Set;
import java.util.TimeZone;

import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import static java.nio.charset.StandardCharsets.UTF_8;

/**
 * Parser for IPTC ANPA News Wire Feeds
 */
public class IptcAnpaParser implements Parser {
    /** Serial version UID */
    private static final long serialVersionUID = -6062820170212879115L;

    /** The media type this parser handles, built with {@code MediaType.text("vnd.iptc.anpa")}. */
    private static final MediaType TYPE =
        MediaType.text("vnd.iptc.anpa");

    /** Single-element, unmodifiable set containing only {@link #TYPE}. */
    private static final Set<MediaType> SUPPORTED_TYPES =
        Collections.singleton(TYPE);

    /**
     * Returns the media types handled by this parser.
     *
     * @param context the parse context; not consulted by this implementation
     * @return the constant {@code SUPPORTED_TYPES} set
     */
    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    /**
     * Parses a wire message: the stream is scanned into a property map, the
     * map is copied into {@code metadata}, and the cleaned message body is
     * emitted as a single XHTML paragraph.
     *
     * @param stream   the message bytes; the section readers rely on mark/reset
     * @param handler  receives the XHTML SAX events
     * @param metadata populated with title, keywords, creator, dates, source and publisher
     * @param context  the parse context; not consulted by this implementation
     * @throws IOException   declared by the interface
     * @throws SAXException  if the content handler rejects an event
     * @throws TikaException declared by the interface
     */
    @Override
    public void parse(
           InputStream stream, ContentHandler handler,
           Metadata metadata, ParseContext context)
           throws IOException, SAXException, TikaException {

        // the map holds String values only; typing it lets get() feed clean(String) directly
        HashMap<String, String> properties = this.loadProperties(stream);
        this.setMetadata(metadata, properties);

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.startElement("p");
        // clean() maps null to "" and never returns null, so no null check is needed
        String body = clean(properties.get("body"));
        xhtml.characters(body);
        xhtml.endElement("p");
        xhtml.endDocument();
    }

    /**
     * Convenience overload that parses with a fresh, empty {@link ParseContext}.
     *
     * @param stream   the message bytes
     * @param handler  receives the XHTML SAX events
     * @param metadata populated with the extracted metadata
     * @throws IOException   propagated from the four-argument {@code parse}
     * @throws SAXException  propagated from the four-argument {@code parse}
     * @throws TikaException propagated from the four-argument {@code parse}
     * @deprecated This method will be removed in Apache Tika 1.0.
     */
    @Deprecated
    public void parse(
            InputStream stream, ContentHandler handler, Metadata metadata)
            throws IOException, SAXException, TikaException {
        parse(stream, handler, metadata, new ParseContext());
    }


   // Wire-format identifiers.  They never change, so they are class-level constants
   // rather than per-instance mutable fields.
   private static final int FMT_ANPA_1312    = 0x00;   // "NAA 89-3 (ANPA 1312)"
   private static final int FMT_ANPA_UPI     = 0x01;   // "United Press International ANPA 1312 variant"
   private static final int FMT_ANPA_UPI_DL  = 0x02;   // "United Press International Down-Load Message"
   private static final int FMT_IPTC_7901    = 0x03;   // "IPTC7901 Recommended Message Format"
   private static final int FMT_IPTC_PHOTO   = 0x04;   // "IPTC-NAA Digital Newsphoto Parameter Record"
   private static final int FMT_IPTC_CHAR    = 0x05;   // "IPTC Unstructured Character Oriented File Format (UCOFF)"
   private static final int FMT_NITF         = 0x06;   // "News Industry Text Format (NITF)"
   private static final int FMT_NITF_TT      = 0x07;   // "Tidningarnas Telegrambyra NITF version (TTNITF DTD)"
   private static final int FMT_NITF_RB      = 0x08;   // "Ritzaus Bureau NITF version (RBNITF DTD)"
   private static final int FMT_IPTC_AP      = 0x09;   // "Associated Press news wire format"
   private static final int FMT_IPTC_BLM     = 0x0A;   // "Bloomberg News news wire format"
   private static final int FMT_IPTC_NYT     = 0x0B;   // "New York Times news wire format"
   private static final int FMT_IPTC_RTR     = 0x0C;   // "Reuters news wire format"

   // Format of the message currently being parsed; assigned while the message is loaded.
   // NOTE(review): this is unsynchronized per-instance state, so concurrent parse() calls
   // on one parser instance could observe each other's format -- confirm intended usage.
   private int FORMAT = FMT_ANPA_1312;    // assume the default format to be ANPA-1312

   // transmission control characters that delimit the sections of a message
   private static final char SOH = 0x01;    // start of header (ctrl-a)
   private static final char STX = 0x02;    // start of text (ctrl-b)
   private static final char ETX = 0x03;    // end of text (ctrl-c)
   private static final char EOT = 0x04;    // end of transmission (ctrl-d)
   private static final char SYN = 0x16;    // synchronous idle (ctrl-v)

   // other control characters
   private static final char BS = 0x08;    // the backspace character (used for diacriticals)
   private static final char TB = 0x09;    // the tab character
   private static final char LF = 0x0A;    // line feed
   private static final char FF = 0x0C;    // form feed
   private static final char CR = 0x0D;    // carriage return
   private static final char XQ = 0x11;    // device control (ctrl-q)
   private static final char XS = 0x13;    // device control (ctrl-s)
   private static final char FS = 0x1F;    // a field delimiter

   // printable delimiters
   private static final char HY = 0x2D;    // hyphen
   private static final char SP = 0x20;    // the blank space
   private static final char LT = 0x3C;    // less than
   private static final char EQ = 0x3D;    // equals sign
   private static final char CT = 0x5E;    // caret

   // single-byte "smart quote" codes, normalised to plain quotes by clean()
   private static final char SL = 0x91;    // single-quote left
   private static final char SR = 0x92;    // single-quote right
   private static final char DL = 0x93;    // double-quote left
   private static final char DR = 0x94;    // double-quote right


   /**
    * Scans the news message and stores the metadata and data into a map.
    * <p>
    * The wire format is detected first, then the stream is consumed section
    * by section (residual, header, body, footer) and each section is handed
    * to its parser, which adds its findings to the map.
    *
    * @param is the message stream; must support mark/reset for the sections to be found
    * @return a map of the extracted values (keys such as "body", "title", "author")
    */
   private HashMap<String, String> loadProperties(InputStream is) {

      HashMap<String, String> properties = new HashMap<String, String>();

      FORMAT = this.scanFormat(is);

      // the residual bytes themselves are not needed, but reading the section
      // positions the stream on the SOH marker that starts the header
      this.getSection(is,"residual");

      byte[] header = this.getSection(is,"header");
      parseHeader(header, properties);

      byte[] body = this.getSection(is,"body");
      parseBody(body, properties);

      byte[] footer = this.getSection(is,"footer");
      parseFooter(footer, properties);

      return (properties);
   }


   /**
    * Guesses which agency produced the message by looking for well-known
    * marker strings in (up to) the first 512K of the stream.
    * <p>
    * When the stream supports mark/reset it is rewound afterwards so the
    * section readers start from the beginning; otherwise the scanned bytes
    * are consumed.
    *
    * @param is the message stream
    * @return the detected {@code FMT_*} constant, or the current format when nothing matched
    */
   private int scanFormat(InputStream is) {
      int format = this.FORMAT;
      final int maxsize = 524288;     //  512K

      final boolean resettable = is.markSupported();
      try {
         if (resettable) {
            is.mark(maxsize);
         }

         // a single read() may legally return fewer bytes than are available,
         // so keep reading until the buffer is full or the stream ends
         byte[] buf = new byte[maxsize];
         int msgsize = 0;
         while (msgsize < maxsize) {
            int count = is.read(buf, msgsize, maxsize - msgsize);
            if (count == -1) {
               break;
            }
            msgsize += count;
         }

         // decode only the bytes actually read, not the zero padding behind them
         String message = new String(buf, 0, msgsize, UTF_8).toLowerCase(Locale.ROOT);

         // these are not if-then-else, because we want to go from most common
         // and fall through to least.  this is imperfect, as these tags could
         // show up in other agency stories, but i can't find a spec or any
         // explicit codes to identify the wire source in the message itself
         if (message.contains("ap-wf")) {
            format = this.FMT_IPTC_AP;
         }
         if (message.contains("reuters")) {
            format = this.FMT_IPTC_RTR;
         }
         if (message.contains("new york times")) {
            format = this.FMT_IPTC_NYT;
         }
         if (message.contains("bloomberg news")) {
            format = this.FMT_IPTC_BLM;
         }
      }
      catch (IOException ignored) {
         // best effort: detection failed, fall back to whatever format we have
      }

      try {
         if (resettable) {
            is.reset();
         }
      }
      catch (IOException ignored) {
         // best effort: the stream could not be rewound, the section readers
         // will simply find less (or no) data
      }
      return (format);
   }


   /**
    * Replaces the wire format currently assumed by this parser.
    *
    * @param format the new format code
    */
   private void setFormat(int format) {
      FORMAT = format;
   }


   /**
    * Maps the current wire format to a human-readable agency name.
    *
    * @return the agency name for the AP, Bloomberg, New York Times and
    *         Reuters formats, or an empty string for any other format
    */
   private String getFormatName() {
      final int current = FORMAT;

      if (current == this.FMT_IPTC_AP) {
         return "Associated Press";
      }
      if (current == this.FMT_IPTC_BLM) {
         return "Bloomberg";
      }
      if (current == this.FMT_IPTC_NYT) {
         return "New York Times";
      }
      if (current == this.FMT_IPTC_RTR) {
         return "Reuters";
      }

      // no dedicated name for the remaining formats
      return "";
   }


   /**
    * Reads one named section of the message by delegating to the
    * marker-based reader with that section's start/finish bytes and size cap.
    * Incomplete sections are always accepted.
    *
    * @param is   the message stream
    * @param name one of "residual", "header", "body" or "footer"
    * @return the section bytes, or an empty array for an unknown name
    */
   private byte[] getSection(InputStream is, String name) {
      final int smallLimit = 8192;      //  8K, generous for residual, header and footer
      final int bodyLimit  = 524288;    //  512K, plenty of space for the message text

      switch (name) {
         case "residual":
            // SYN [0x16 : ctrl-v] up to SOH [0x01 : ctrl-a]; may hold leftovers of a preceding message
            return getSection(is, smallLimit, (byte) SYN, (byte) SOH, true);

         case "header":
            // SOH [0x01 : ctrl-a] up to STX [0x02 : ctrl-b], which marks the beginning of the message
            return getSection(is, smallLimit, (byte) SOH, (byte) STX, true);

         case "body":
            // STX [0x02 : ctrl-b] up to ETX [0x03 : ctrl-c], which marks the beginning of the footer
            return getSection(is, bodyLimit, (byte) STX, (byte) ETX, true);

         case "footer":
            // ETX [0x03 : ctrl-c] up to EOT [0x04 : ctrl-d], which marks the end of transmission
            return getSection(is, smallLimit, (byte) ETX, (byte) EOT, true);

         default:
            return new byte[0];
      }
   }


   /**
    * Extracts the bytes that lie between a start marker and a finish marker.
    * <p>
    * Up to {@code maxsize} bytes (capped by {@code is.available()}) are read
    * after marking the stream.  The stream is then reset and, when the finish
    * marker was found, advanced so that it sits right on top of that marker,
    * which is the start marker of the following section.
    *
    * @param is           the message stream; mark/reset must work, otherwise an empty array is returned
    * @param maxsize      upper bound on the number of bytes to inspect
    * @param bstart       byte that opens the section (not included in the result)
    * @param bfinish      byte that closes the section (not included in the result)
    * @param ifincomplete whether to return what was read when the start marker
    *                     was seen but the finish marker was not
    * @return the section content, or an empty array when nothing usable was found
    *         or an I/O error occurred
    */
   private byte[] getSection(InputStream is, int maxsize, byte bstart, byte bfinish, boolean ifincomplete) {
      byte[] value  = new byte[0];

      try {
         // TODO: this only pulls back 8K of data on a read, regardless of buffer size
         //       more nefariously, it caps at a total 8K, through all sections
         int streammax = is.available();
         maxsize = Math.max(0, Math.min(maxsize, streammax));

         is.mark(maxsize);
         byte[] buf = new byte[maxsize];

         // fill the buffer; a read may return fewer bytes than requested, so loop.
         // the length passed is what is still free in the buffer -- asking for
         // maxsize at a non-zero offset would throw IndexOutOfBoundsException
         int totsize = 0;
         while (totsize < maxsize) {
            int msgsize = is.read(buf, totsize, maxsize - totsize);
            if (msgsize == -1) {
               break;
            }
            totsize += msgsize;
         }

         boolean started = false;                   // check if we have found the start flag
         boolean finished = false;                  // check if we have found the finish flag
         int start = 0;                             // the position after the start flag
         int read;                                  // the number of bytes we scanned

         // scan through the buffered bytes
         for (read = 0; read < totsize; read++) {
            byte b = buf[read];

            if (!started) {
               started = (b == bstart);
               start = read + 1;
               continue;
            }

            if (b == bfinish) {
               finished = true;
               break;
            }
         }

         // move the input stream back to where it was initially
         is.reset();

         if (finished) {
            // now, we want to reset the stream to be sitting right on top of the finish marker;
            // skip() may skip fewer bytes than asked for, so repeat until done or stuck
            long pending = read;
            while (pending > 0) {
               long skipped = is.skip(pending);
               if (skipped <= 0) {
                  break;
               }
               pending -= skipped;
            }
            value = new byte[read-start];
            System.arraycopy(buf, start, value, 0, read-start);
         }
         else if (ifincomplete && started) {
            // the caller wants anything that was read, and we finished the stream or buffer
            value = new byte[read-start];
            System.arraycopy(buf, start, value, 0, read-start);
         }
      }
      catch (IOException ignored) {
         // something invalid occurred (for example reset() is unsupported), return an empty section
      }

      return (value);
   }


   /**
    * Pulls the envelope fields (service id, category, subject heading, date
    * and time) out of the header section.
    * <p>
    * None of the values are currently stored in {@code properties}; the
    * method only reports whether anything was recognised.
    *
    * @param value      the raw header bytes
    * @param properties the property map (not modified here)
    * @return {@code true} when at least one of service id, category, subject,
    *         date or time is non-empty
    */
   private boolean parseHeader(byte[] value, HashMap<String, String> properties) {
      boolean added = false;

      String env_serviceid = "";
      String env_category = "";
      String env_urgency = "";
      String hdr_edcode = "";
      String hdr_subject = "";
      String hdr_date = "";
      String hdr_time = "";

      int read = 0;

      while (read < value.length) {

         // pull apart the envelope, getting the service id  (....\x1f)
         while (read < value.length) {
            byte val_next = value[read++];
            if (val_next != FS) {
               env_serviceid += (char)(val_next & 0xff);  // convert the byte to an unsigned int
            }
            else {
               break;
            }
         }

         // pull apart the envelope, getting the category  (....\x13\x11)
         while (read < value.length) {
            byte val_next = value[read++];
            if (val_next != XS) {   // the end of the envelope is marked (\x13)
               env_category += (char)(val_next & 0xff);  // convert the byte to an unsigned int
            }
            else {
               // consume the remaining byte (\x11) when it is there; the bounds check
               // matters when \x13 is the very last byte of the header
               if ((read < value.length) && (value[read] == XQ)) {
                  read++;
               }
               break;
            }
         }

         // pull apart the envelope, getting the subject heading
         while (read < value.length) {
            boolean subject = true;
            byte val_next = value[read++];
            while ((subject) && (val_next != SP) && (val_next != 0x00)) {  // ignore the envelope subject
               hdr_subject += (char)(val_next & 0xff);  // convert the byte to an unsigned int
               val_next =  (read < value.length) ? value[read++] : 0x00;
               while (val_next == SP) {  // consume all the spaces
                  subject = false;
                  val_next =  (read < value.length) ? value[read++] : 0x00;
                  if (val_next != SP) {
                     --read;  // otherwise we eat into the next section
                  }
               }
            }
            if (!subject) {
               break;
            }
         }

         // pull apart the envelope, getting the date and time
         while (read < value.length) {
            byte val_next = value[read++];
            if (hdr_date.length() == 0) {
               while (((val_next >= (byte)0x30) && (val_next <= (byte)0x39))  // consume all numerics and hyphens
                  ||   (val_next == HY)) {
                  hdr_date += (char)(val_next & 0xff);  // convert the byte to an unsigned int
                  val_next =  (read < value.length) ? value[read++] : 0x00;
               }
            }
            else if (val_next == SP) {
               while (val_next == SP) {  // consume all the spaces
                  val_next =  (read < value.length) ? value[read++] : 0x00;
               }
               continue;
            }
            else {
               while (((val_next >= (byte)0x30) && (val_next <= (byte)0x39))  // consume all numerics and hyphens
                  ||   (val_next == HY)) {
                  hdr_time += (char)(val_next & 0xff);  // convert the byte to an unsigned int
                  val_next =  (read < value.length) ? value[read++] : 0x00;
               }
            }
         }
         break; // don't let this run back through and start thrashing metadata
      }

      // if we were saving any of these values, we would set the properties map here

      added = (env_serviceid.length() + env_category.length() + hdr_subject.length() + 
               hdr_date.length() + hdr_time.length()) > 0; 
      return added;
   }

   /**
    * Splits the body section into heading, title, author, source and the
    * story text, and stores them in {@code properties} under the keys
    * "subject", "title", "author", "source" and "body".
    * <p>
    * Metadata lines are introduced by a caret and normally end with a
    * less-than sign or an end of line.  Reuters and Bloomberg messages omit
    * some carets, so for those formats a caret is written back into the byte
    * array and the position is rewound ("pushed back") before re-reading.
    *
    * @param value      the raw body bytes; may be modified or replaced locally by the push-back logic
    * @param properties receives the extracted values
    * @return {@code true} when at least one extracted value is non-empty
    */
   private boolean parseBody(byte[] value, HashMap<String, String> properties) {
      boolean added = false;

      String bdy_heading = "";
      String bdy_title = "";
      String bdy_source = "";
      String bdy_author = "";
      String bdy_body = "";

      int read = 0;
      boolean done = false;

      while (!done && (read < value.length)) {

         // pull apart the body, getting the heading (^....\x0d\x0a)
         while (read < value.length) {
            byte val_next = value[read++];
            if (val_next == CT) {      //  start of a new section , first is the heading
               val_next =  (read < value.length) ? value[read++] : 0x00;
               // AP, NYT, and Bloomberg end with < , Reuters with EOL
               while ((val_next != LT) && (val_next != CR) && (val_next != LF)) {   // less than delimiter (\x3c) and not EOL
                  bdy_heading += (char)(val_next & 0xff);  // convert the byte to an unsigned int
                  val_next =  (read < value.length) ? value[read++] : 0x00;
                  if (read > value.length) { break; }  // shouldn't ever hit this, but save a NPE
               }
               if (val_next == LT) {
                  // hit the delimiter, carry on
                  val_next =  (read < value.length) ? value[read++] : 0x00;
               }
               while (bdy_heading.length() > 0 && ((val_next == CR) || (val_next == LF))) {
                  val_next =  (read < value.length) ? value[read++] : 0x00;  // skip the new lines
                  if ((val_next != CR) && (val_next != LF)) {
                     --read;
                  }
               }
            }
            else {
               // this will only be hit on poorly-formed files

               // for reuters, the heading does not start with the ^, so we push one back into the stream
               if (FORMAT == this.FMT_IPTC_RTR) {
                  if (val_next != CT) {
                     // for any non-whitespace, we need to go back an additional step to not destroy the data
                     if ((val_next != SP) && (val_next != TB) && (val_next != CR) && (val_next != LF)) {
                        // if the very first byte is data, we have to shift the whole array, and stuff in a caret
                        if (read == 1) {
                           byte[] resize = new byte[value.length + 1];
                           System.arraycopy(value, 0, resize, 1, value.length);
                           value = resize;
                        }
                     }
                     value[--read] = CT;
                     continue;
                  }
               }
            }
            break;
         }

         // pull apart the body, getting the title (^....\x0d\x0a)
         while (read < value.length) {
            byte val_next = value[read++];
            if (val_next == CT) {      //  start of a new section , second is the title
               val_next =  (read < value.length) ? value[read++] : 0x00;
               // AP, NYT, and Bloomberg end with < , Reuters with EOL
               while ((val_next != LT) && (val_next != CT) && (val_next != CR) && (val_next != LF)) {   // less than delimiter (\x3c), or caret (\x5e) and not EOL
                  bdy_title += (char)(val_next & 0xff);  // convert the byte to an unsigned int
                  val_next =  (read < value.length) ? value[read++] : 0x00;
                  if (read > value.length) { break; }  // shouldn't ever hit this, but save a NPE
               }

               if (val_next == CT) {      //  start of a new section , when first didn't finish cleanly
                   --read;
               }

               if (val_next == LT) {
                  // hit the delimiter, carry on
                  val_next =  (read < value.length) ? value[read++] : 0x00;
               }

               while (bdy_title.length() > 0 && ((val_next == CR) || (val_next == LF))) {
                  val_next =  (read < value.length) ? value[read++] : 0x00;  // skip the new lines
                  if ((val_next != CR) && (val_next != LF)) {
                     --read;
                  }
               }
            }
            else {
               // this will only be hit on poorly-formed files

               // for bloomberg, the title does not start with the ^, so we push one back into the stream
               if (FORMAT == this.FMT_IPTC_BLM) {
                  if (val_next == TB) {
                     value[--read] = CT;
                     continue;
                  }
               }

               // for reuters, the title does not start with the ^, so we push one back into the stream
               if (FORMAT == this.FMT_IPTC_RTR) {
                  if (val_next != CT) {
                     // for any non-whitespace, we need to go back an additional step to not destroy the data
                     if ((val_next != SP) && (val_next != TB) && (val_next != CR) && (val_next != LF)) {
                        --read;
                     }
                     value[--read] = CT;
                     continue;
                  }
               }
            }
            break;
         }


         // at this point, we have a variable number of metadata lines, with various orders
         // we scan the start of each line for the special character, and run to the end character
         boolean metastarted = false;   // set once an author or source line has been seen
         String longline = "";          // key of a metadata entry that continues on the next line
         String longkey = "";
         while (read < value.length) {
            byte val_next = value[read++];

            // eat up whitespace before committing to the next section
            if ((val_next == SP) || (val_next == TB) || (val_next == CR) || (val_next == LF)) {
               continue;
            }

            if (val_next == CT) {      //  start of a new section , could be authors, sources, etc
               val_next =  (read < value.length) ? value[read++] : 0x00;
               String tmp_line = "";
               while ((val_next != LT) && (val_next != CT) && (val_next != CR) && (val_next != LF) && (val_next != 0))  {
                  // less than delimiter (\x3c), maybe also badly formed with just new line
                  tmp_line += (char)(val_next & 0xff);  // convert the byte to an unsigned int
                  val_next =  (read < value.length) ? value[read++] : 0x00;
                  if (read > value.length) { break; }  // shouldn't ever hit this, but save a NPE
               }

               if (val_next == CT) {      //  start of a new section , when first didn't finish cleanly
                   --read;
               }

               if (val_next == LT) {
                  // hit the delimiter, carry on
                  val_next =  (read < value.length) ? value[read++] : 0x00;
               }

               while ((val_next == CR) || (val_next == LF)) {
                  val_next =  (read < value.length) ? value[read++] : 0x00;  // skip the new lines
                  if ((val_next != CR) && (val_next != LF)) {
                     --read;
                  }
               }
               if (tmp_line.toLowerCase(Locale.ROOT).startsWith("by") || longline.equals("bdy_author")) {
                  longkey = "bdy_author";

                  // prepend a space to subsequent line, so it gets parsed consistent with the lead line
                  tmp_line = (longline.equals(longkey) ? " " : "") + tmp_line;

                  // we have an author candidate
                  int term = tmp_line.length();
                  term = Math.min(term, (tmp_line.contains("<") ? tmp_line.indexOf("<")  : term));
                  term = Math.min(term, (tmp_line.contains("=") ? tmp_line.indexOf("=")  : term));
                  term = Math.min(term, (tmp_line.contains("\n") ? tmp_line.indexOf("\n") : term));
                  term = (term > 0 ) ? term : tmp_line.length();
                  // the name starts at the first space; a line without a space (or with the
                  // terminator before it) contributes nothing instead of throwing
                  bdy_author += boundedSubstring(tmp_line, tmp_line.indexOf(" "), term);
                  metastarted = true;
                  longline = ((tmp_line.contains("=")) && (!longline.equals(longkey)) ? longkey : "");
               }
               else if (FORMAT == this.FMT_IPTC_BLM) {
                  String byline = "   by ";
                  if (tmp_line.toLowerCase(Locale.ROOT).contains(byline)) {
                     longkey = "bdy_author";

                     int term = tmp_line.length();
                     term = Math.min(term, (tmp_line.contains("<") ? tmp_line.indexOf("<")  : term));
                     term = Math.min(term, (tmp_line.contains("=") ? tmp_line.indexOf("=")  : term));
                     term = Math.min(term, (tmp_line.contains("\n") ? tmp_line.indexOf("\n") : term));
                     term = (term > 0 ) ? term : tmp_line.length();
                     // for bloomberg, the author line sits below their copyright statement
                     bdy_author += boundedSubstring(tmp_line, tmp_line.toLowerCase(Locale.ROOT).indexOf(byline) + byline.length(), term) + " ";
                     metastarted = true;
                     longline = ((tmp_line.contains("=")) && (!longline.equals(longkey)) ? longkey : "");
                  }
                  else if(tmp_line.toLowerCase(Locale.ROOT).startsWith("c.")) {
                     // the author line for bloomberg is a multiline starting with c.2011 Bloomberg News
                     // then containing the author info on the next line
                     if (val_next == TB) {
                        value[--read] = CT;
                        continue;
                     }
                  }
                  else if(tmp_line.toLowerCase(Locale.ROOT).trim().startsWith("(") && tmp_line.toLowerCase(Locale.ROOT).trim().endsWith(")")) {
                     // the author line may have one or more comment lines between the copyright
                     // statement, and the By AUTHORNAME line
                     if (val_next == TB) {
                        value[--read] = CT;
                        continue;
                     }
                  }
               }

               else if (tmp_line.toLowerCase(Locale.ROOT).startsWith("eds") || longline.equals("bdy_source")) {
                  longkey = "bdy_source";
                  // prepend a space to subsequent line, so it gets parsed consistent with the lead line
                  tmp_line = (longline.equals(longkey) ? " " : "") + tmp_line;

                  // we have a source candidate
                  int term = tmp_line.length();
                  term = Math.min(term, (tmp_line.contains("<") ? tmp_line.indexOf("<")  : term));
                  term = Math.min(term, (tmp_line.contains("=") ? tmp_line.indexOf("=")  : term));
                  term = (term > 0 ) ? term : tmp_line.length();
                  bdy_source += boundedSubstring(tmp_line, tmp_line.indexOf(" ") + 1, term) + " ";
                  metastarted = true;
                  longline = (!longline.equals(longkey) ? longkey  : "");
               }
               else {
                  // this has fallen all the way through.  trap it as part of the subject,
                  // rather than just losing it
                  if (!metastarted) {
                     bdy_title += " , " + tmp_line;     //  not sure where else to put this but in the title
                  }
                  else {
                     // what to do with stuff that is metadata, which falls after metadata lines started?
                     bdy_body += " " + tmp_line + " , ";     //  not sure where else to put this but in the body
                  }
               }
            }
            else {  // we're on to the main body
               while ((read < value.length) && (val_next != 0))  {
                  // read until the train runs out of tracks
                  bdy_body += (char)(val_next & 0xff);  // convert the byte to an unsigned int
                  val_next =  (read < value.length) ? value[read++] : 0x00;
                  if (read > value.length) { break; }  // shouldn't ever hit this, but save a NPE
               }

            }
            // we would normally break here, but just let this read out to the end
         }
         done = true; // don't let this run back through and start thrashing metadata
      }
      properties.put("body", bdy_body);
      properties.put("title", bdy_title);
      properties.put("subject", bdy_heading);
      properties.put("author", bdy_author);
      properties.put("source", bdy_source);

      added = (bdy_body.length() + bdy_title.length() + bdy_heading.length() + bdy_author.length() +
               bdy_source.length()) > 0;
      return added;
   }

   /**
    * Same as {@code text.substring(from, to)} for valid indexes, but never
    * throws: {@code to} is clamped to the text, a negative {@code from}
    * (pattern not found) yields an empty string, and a {@code from} beyond
    * {@code to} is clamped to {@code to}.
    */
   private static String boundedSubstring(String text, int from, int to) {
      int end = Math.max(0, Math.min(to, text.length()));
      int begin = (from < 0) ? end : Math.min(from, end);
      return text.substring(begin, end);
   }


   /**
    * Extracts the feed source and the transmission date/time from the footer
    * section and stores them in {@code properties} under "publisher",
    * "created" and "modified".
    * <p>
    * Everything up to the first digit is taken as the source; the rest of the
    * line is parsed as a date ("MM-dd-yy HHmmzzz", or "HH:mm MM-dd-yy" for
    * Reuters) and re-formatted as "yyyy-MM-dd'T'HH:mm:ss'Z'" in UTC.  When
    * the text cannot be parsed, the current time is used instead.
    *
    * @param value      the raw footer bytes
    * @param properties receives the extracted values
    * @return {@code true} when a source or a date/time was found
    */
   private boolean parseFooter(byte[] value, HashMap<String, String> properties) {
      boolean added = false;

      String ftr_source = "";
      String ftr_datetime = "";

      int read = 0;
      boolean done = false;

      while (!done && (read < value.length)) {

         // pull apart the footer, getting the news feed source (^....\x0d\x0a)
         byte val_next = value[read++];

         while (((val_next < (byte)0x30) || (val_next > (byte)0x39)) && (val_next != 0)) {  // consume all non-numerics first
            ftr_source += (char)(val_next & 0xff);  // convert the byte to an unsigned int
            val_next =  (read < value.length) ? value[read] : 0x00;  // attempt to read until end of stream
            read++;
            if (read > value.length) { break; }  // shouldn't ever hit this, but save a NPE
         }

         while ((val_next != LT) && (val_next != CR) && (val_next != LF) && (val_next != 0))  {  // get as much timedate as possible
            // this is an american format, so arrives as mm-dd-yy HHiizzz
            ftr_datetime += (char)(val_next & 0xff);  // convert the byte to an unsigned int
            val_next =  (read < value.length) ? value[read++] : 0x00;  // read on until the end of the line
            if (read > value.length) { break; }  // shouldn't ever hit this, but save a NPE
         }
         if (val_next == LT) {
            // hit the delimiter, carry on
            val_next =  (read < value.length) ? value[read++] : 0x00;
         }

         if (ftr_datetime.length() > 0) {
            // we want to pass this back in a more friendly format
            String format_out = "yyyy-MM-dd'T'HH:mm:ss'Z'";
            Date dateunix = new Date();
            try {
               // standard ap format
               String format_in = "MM-dd-yy HHmmzzz";

               if (FORMAT == this.FMT_IPTC_RTR) {
                  // standard reuters format
                  format_in = "HH:mm MM-dd-yy";
               }
               SimpleDateFormat dfi = new SimpleDateFormat(format_in, Locale.ROOT);
               dfi.setTimeZone(TimeZone.getTimeZone("UTC"));
               dateunix = dfi.parse(ftr_datetime);
            }
            catch (ParseException ep) {
               // failed, but this will just fall through to setting the date to now
            }
            SimpleDateFormat dfo = new SimpleDateFormat(format_out, Locale.ROOT);
            dfo.setTimeZone(TimeZone.getTimeZone("UTC"));
            ftr_datetime = dfo.format(dateunix);
         }
         while ((val_next == CR) || (val_next == LF)) {
            val_next =  (read < value.length) ? value[read++] : 0x00;  // skip the new lines
            if ((val_next != CR) && (val_next != LF)) {
               --read;
            }
         }
         done = true; // don't let this run back through and start thrashing metadata
      }

      properties.put("publisher", ftr_source);
      properties.put("created", ftr_datetime);
      properties.put("modified", ftr_datetime);

      added = (ftr_source.length() + ftr_datetime.length()) > 0; 
      return added;
   }


   /**
    * Copies the extracted properties into the Tika metadata object.
    * <p>
    * Every value goes through {@code clean}, which also turns a missing
    * (null) entry into an empty string.
    *
    * @param metadata   the metadata object to populate
    * @param properties the values collected while parsing the message
    */
   private void setMetadata(Metadata metadata, HashMap<String, String> properties) {

      // every property that gets set must be non-null, or it will cause NPE
      // in other consuming applications, like Lucene
      metadata.set(Metadata.CONTENT_TYPE,  clean("text/anpa-1312"));
      metadata.set(TikaCoreProperties.TITLE,         clean(properties.get("title")));
      metadata.set(TikaCoreProperties.KEYWORDS,       clean(properties.get("subject")));
      metadata.set(TikaCoreProperties.CREATOR,        clean(properties.get("author")));
      metadata.set(TikaCoreProperties.CREATED, clean(properties.get("created")));
      metadata.set(TikaCoreProperties.MODIFIED,      clean(properties.get("modified")));
      metadata.set(TikaCoreProperties.SOURCE,      clean(properties.get("source")));
      // the publisher is the detected agency name, not the "publisher" text read from the footer
      metadata.set(TikaCoreProperties.PUBLISHER,     clean(this.getFormatName()));
   }

   /**
    * Normalises a text value for output: a null becomes an empty string,
    * doubled back-ticks and doubled apostrophes collapse to single ones, the
    * SL/SR/DL/DR quote codes become plain ASCII quotes, and surrounding
    * whitespace is trimmed.
    *
    * @param value the raw text, may be null
    * @return the normalised text, never null
    */
   private String clean(String value) {
      String text = (value == null) ? "" : value;

      // all patterns are plain literals, so literal replacement is equivalent to the regex form
      return text.replace("``", "`")
                 .replace("''", "'")
                 .replace(SL, '\'')
                 .replace(SR, '\'')
                 .replace(DL, '"')
                 .replace(DR, '"')
                 .trim();
   }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy