All downloads are free. Search and download functionality uses the official Maven repository.

edu.stanford.nlp.international.arabic.process.ArabicDocumentReaderAndWriter Maven / Gradle / Ivy

Go to download

Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.

There is a newer version: 4.5.7
Show newest version
package edu.stanford.nlp.international.arabic.process; 
import edu.stanford.nlp.util.logging.Redwood;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.objectbank.IteratorFromReaderFactory;
import edu.stanford.nlp.objectbank.LineIterator;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.process.SerializableFunction;
import edu.stanford.nlp.sequences.DocumentReaderAndWriter;
import edu.stanford.nlp.sequences.SeqClassifierFlags;

/**
 * Reads newline delimited UTF-8 Arabic sentences with or without
 * gold segmentation markers. When segmentation markers are present,
 * this class may be used for training or evaluating a segmenter.
 *
 * @author Spence Green
 */
public class ArabicDocumentReaderAndWriter implements DocumentReaderAndWriter  {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(ArabicDocumentReaderAndWriter.class);

  private static final long serialVersionUID = 3667837672769424178L;

  private final IteratorFromReaderFactory> factory;

  private final TokenizerFactory tf;

  // The segmentation marker used in the ATBv3 training data.
  private static final Character DEFAULT_SEG_MARKER = '-';

  private final Character segMarker;

  // TODO(spenceg): Make this configurable.
  private static final String tagDelimiter = "|||";
  private static final String rewriteDelimiter = ">>>";

  private final boolean inputHasTags;
  private final boolean inputHasDomainLabels;
  private final String inputDomain;
  private final boolean shouldStripRewrites;

  public static class RewrittenArabicAnnotation implements CoreAnnotation {
    public Class getType() {
      return String.class;
    }
  }

  /**
   *
   * @param hasSegMarkers if true, input has segmentation markers
   */
  public ArabicDocumentReaderAndWriter(boolean hasSegMarkers) {
    this(hasSegMarkers, null);
  }

  /**
   *
   * @param hasSegMarkers if true, input has segmentation markers
   * @param tokFactory a TokenizerFactory for the input
   */
  public ArabicDocumentReaderAndWriter(boolean hasSegMarkers, TokenizerFactory tokFactory) {
    this(hasSegMarkers, false, tokFactory);
  }

  /**
   *
   * @param hasSegMarkers if true, input has segmentation markers
   * @param hasTags if true, input has morphological analyses separated by tagDelimiter.
   * @param tokFactory a TokenizerFactory for the input
   */
  public ArabicDocumentReaderAndWriter(boolean hasSegMarkers,
                                       boolean hasTags,
                                       TokenizerFactory tokFactory) {
    this(hasSegMarkers, hasTags, false, "123", tokFactory);
  }
  
  /**
   *
   * @param hasSegMarkers if true, input has segmentation markers
   * @param hasTags if true, input has morphological analyses separated by tagDelimiter.
   * @param hasDomainLabels if true, input has a whitespace-terminated domain at the beginning
   *     of each line of text
   * @param tokFactory a TokenizerFactory for the input
   */
  public ArabicDocumentReaderAndWriter(boolean hasSegMarkers,
                                       boolean hasTags,
                                       boolean hasDomainLabels,
                                       String domain,
                                       TokenizerFactory tokFactory) {
    this(hasSegMarkers, hasTags, hasDomainLabels, domain, false, tokFactory);
  }
  
  /**
  *
  * @param hasSegMarkers if true, input has segmentation markers
  * @param hasTags if true, input has morphological analyses separated by tagDelimiter.
  * @param hasDomainLabels if true, input has a whitespace-terminated domain at the beginning
  *     of each line of text
  * @param stripRewrites if true, erase orthographical rewrites from the gold labels (for
  *     comparison purposes)
  * @param tokFactory a TokenizerFactory for the input
  */
  public ArabicDocumentReaderAndWriter(boolean hasSegMarkers,
      boolean hasTags,
      boolean hasDomainLabels,
      String domain,
      boolean stripRewrites,
      TokenizerFactory tokFactory) {
    tf = tokFactory;
    inputHasTags = hasTags;
    inputHasDomainLabels = hasDomainLabels;
    inputDomain = domain;
    shouldStripRewrites = stripRewrites;
    segMarker = hasSegMarkers ? DEFAULT_SEG_MARKER : null;
    factory = LineIterator.getFactory(new SerializableFunction>() {
      private static final long serialVersionUID = 5243251505653686497L;
      public List apply(String in) {
        List tokenList;
        
        String lineDomain = "";
        if (inputHasDomainLabels) {
          String[] domainAndData = in.split("\\s+", 2);
          if (domainAndData.length < 2) {
            log.info("Missing domain label or text: ");
            log.info(in);
          } else {
            lineDomain = domainAndData[0];
            in = domainAndData[1];
          }
        } else {
          lineDomain = inputDomain;
        }

        if (inputHasTags) {
          String[] toks = in.split("\\s+");
          List input = new ArrayList<>(toks.length);
          final String tagDelim = Pattern.quote(tagDelimiter);
          final String rewDelim = Pattern.quote(rewriteDelimiter);
          for (String wordTag : toks) {
            String[] wordTagPair = wordTag.split(tagDelim);
            assert wordTagPair.length == 2;
            String[] rewritePair = wordTagPair[0].split(rewDelim);
            assert rewritePair.length == 1 || rewritePair.length == 2;
            String raw = rewritePair[0];
            String rewritten = raw;
            if (rewritePair.length == 2)
              rewritten = rewritePair[1];

            CoreLabel cl = new CoreLabel();
            if (tf != null) {
              List lexListRaw = tf.getTokenizer(new StringReader(raw)).tokenize();
              List lexListRewritten = tf.getTokenizer(new StringReader(rewritten)).tokenize();
              if (lexListRewritten.size() != lexListRaw.size()) {
                System.err.printf("%s: Different number of tokens in raw and rewritten: %s>>>%s%n", this.getClass().getName(), raw, rewritten);
                lexListRewritten = lexListRaw;

              }
              if (lexListRaw.size() == 0) {
                continue;
              
              } else if (lexListRaw.size() == 1) {
                raw = lexListRaw.get(0).value();
                rewritten = lexListRewritten.get(0).value();
              
              } else if (lexListRaw.size() > 1) {
                String secondWord = lexListRaw.get(1).value();
                if (secondWord.equals(String.valueOf(segMarker))) {
                  // Special case for the null marker in the vocalized section
                  raw = lexListRaw.get(0).value() + segMarker;
                  rewritten = lexListRewritten.get(0).value() + segMarker;
                } else {
                  System.err.printf("%s: Raw token generates multiple segments: %s%n", this.getClass().getName(), raw);
                  raw = lexListRaw.get(0).value();
                  rewritten = lexListRewritten.get(0).value();
                }
              }
            }
            cl.setValue(raw);
            cl.setWord(raw);
            cl.setTag(wordTagPair[1]);
            cl.set(CoreAnnotations.DomainAnnotation.class, lineDomain);
            cl.set(RewrittenArabicAnnotation.class, rewritten);
            input.add(cl);
          }
          tokenList = IOBUtils.StringToIOB(input, segMarker, true, shouldStripRewrites);

        } else if (tf == null) {
          tokenList = IOBUtils.StringToIOB(in, segMarker);

        } else {
          List line = tf.getTokenizer(new StringReader(in)).tokenize();
          tokenList = IOBUtils.StringToIOB(line, segMarker, false);
        }
        
        if (inputHasDomainLabels && !inputHasTags)
          IOBUtils.labelDomain(tokenList, lineDomain);
        else if (!inputHasDomainLabels)
          IOBUtils.labelDomain(tokenList, inputDomain);
        return tokenList;
      }
    });
  }

  /**
   * Required, but unused.
   */
  public void init(SeqClassifierFlags flags) {}

  /**
   * Iterate over an input document.
   */
  public Iterator> getIterator(Reader r) {
    return factory.getIterator(r);
  }

  public void printAnswers(List doc, PrintWriter pw) {
    pw.println("Answer\tGoldAnswer\tCharacter");
    for(CoreLabel word : doc) {
      pw.printf("%s\t%s\t%s%n", word.get(CoreAnnotations.AnswerAnnotation.class),
                                word.get(CoreAnnotations.GoldAnswerAnnotation.class),
                                word.get(CoreAnnotations.CharAnnotation.class));
    }
  }
  
  /**
   * For debugging.
   * 
   * @param args
   * @throws IOException 
   */
  public static void main(String[] args) throws IOException {
    if (args.length != 1) {
      System.err.printf("Usage: java %s file > output%n", ArabicDocumentReaderAndWriter.class.getName());
      System.exit(-1);
    }
    String fileName = args[0];
    TokenizerFactory tokFactory = ArabicTokenizer.atbFactory();
    String atbVocOptions = "removeProMarker,removeMorphMarker";
    tokFactory.setOptions(atbVocOptions);
    
    BufferedReader reader = IOUtils.readerFromString(fileName);
    for (String line; (line = reader.readLine()) != null; ) {
      String[] toks = line.split("\\s+");
      final String delim = Pattern.quote(tagDelimiter);
      boolean isStart = true;
      for (String wordTag : toks) {
        String[] wordTagPair = wordTag.split(delim);
        assert wordTagPair.length == 2;
        String word = wordTagPair[0];
        if (tokFactory != null) {
          List lexList = tokFactory.getTokenizer(new StringReader(word)).tokenize();
          if (lexList.size() == 0) {
            continue;
          
          } else if (lexList.size() == 1) {
            word = lexList.get(0).value();
          
          } else if (lexList.size() > 1) {
            String secondWord = lexList.get(1).value();
            if (secondWord.equals(String.valueOf(DEFAULT_SEG_MARKER))) {
              // Special case for the null marker in the vocalized section
              word = lexList.get(0).value() + String.valueOf(DEFAULT_SEG_MARKER);
            } else {
              System.err.printf("%s: Raw token generates multiple segments: %s%n", ArabicDocumentReaderAndWriter.class.getName(), word);
              word = lexList.get(0).value();
            }
          }
        }
        if ( ! isStart ) System.out.print(" ");
        System.out.print(word);
        isStart = false;
      }
      System.out.println();
    }
   
//    DocumentReaderAndWriter docReader = new ArabicDocumentReaderAndWriter(true,
//        true,
//        false,
//        tokFactory);
//    Iterator> itr = docReader.getIterator(new InputStreamReader(new FileInputStream(new File(fileName))));
//    while(itr.hasNext()) {
//      List line = itr.next();
//      System.out.println(Sentence.listToString(line));
//    }
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy