All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.languagetool.dev.dumpcheck.MixingSentenceSource Maven / Gradle / Ivy

There is a newer version: 6.3
Show newest version
/* LanguageTool, a natural language style checker
 * Copyright (C) 2013 Daniel Naber (http://www.danielnaber.de)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.dev.dumpcheck;

import org.apache.commons.lang.StringUtils;
import org.languagetool.Language;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.*;
import java.util.regex.Pattern;

/**
 * Alternately returns sentences from different sentence sources.
 *
 * <p>Sources are visited round-robin. A source that has no more sentences is
 * removed from the rotation, so iteration continues until every source is
 * exhausted.
 * @since 2.4
 */
public class MixingSentenceSource extends SentenceSource {

  /** Sources still in rotation; exhausted sources are removed by {@link #next()}. */
  private final List<SentenceSource> sources;
  /** Number of sentences returned so far, keyed by the sentence's source. */
  private final Map<String, Integer> sourceDistribution = new HashMap<>();

  /** Round-robin position; a {@code long} so it cannot wrap to a negative list index. */
  private long count;

  /**
   * Creates a mixing source for the given files without any filter.
   *
   * @param dumpFileNames paths of the files to read, see
   *        {@link #create(List, Language, Pattern)} for the naming rules
   * @param language language passed on to every underlying source
   * @throws IOException if one of the files cannot be opened
   */
  public static MixingSentenceSource create(List<String> dumpFileNames, Language language) throws IOException {
    return create(dumpFileNames, language, null);
  }

  /**
   * Creates a mixing source, choosing the handler for each file by its name:
   * {@code *.xml} is read as Wikipedia, otherwise {@code tatoeba-*} as Tatoeba,
   * otherwise {@code *.txt} as plain text. The checks are applied in that order.
   *
   * <p>On success the opened streams are handed to the created sources and are
   * not closed here. If creation fails part-way, the streams opened so far are closed.
   *
   * @param dumpFileNames paths of the files to read
   * @param language language passed on to every underlying source
   * @param filter passed on to every underlying source, may be {@code null}
   * @throws IOException if one of the files cannot be opened
   * @throws RuntimeException if a file name matches none of the supported patterns
   */
  public static MixingSentenceSource create(List<String> dumpFileNames, Language language, Pattern filter) throws IOException {
    List<SentenceSource> sources = new ArrayList<>();
    List<FileInputStream> openedStreams = new ArrayList<>();
    try {
      for (String dumpFileName : dumpFileNames) {
        File file = new File(dumpFileName);
        if (file.getName().endsWith(".xml")) {
          sources.add(new WikipediaSentenceSource(open(dumpFileName, openedStreams), language, filter));
        } else if (file.getName().startsWith("tatoeba-")) {
          sources.add(new TatoebaSentenceSource(open(dumpFileName, openedStreams), language, filter));
        } else if (file.getName().endsWith(".txt")) {
          sources.add(new PlainTextSentenceSource(open(dumpFileName, openedStreams), language, filter));
        } else {
          throw new RuntimeException("Could not find a source handler for " + dumpFileName +
                  " - Wikipedia files must be named '*.xml', Tatoeba files must be named 'tatoeba-*'," +
                  " plain text files must be named '*.txt'");
        }
      }
    } catch (IOException | RuntimeException e) {
      // Nobody will consume the streams opened for earlier files, so release them.
      closeAfterFailure(openedStreams, e);
      throw e;
    }
    return new MixingSentenceSource(sources, language);
  }

  /** Opens the file and records the stream so it can be closed if creation fails later. */
  private static FileInputStream open(String fileName, List<FileInputStream> openedStreams) throws IOException {
    FileInputStream stream = new FileInputStream(fileName);
    openedStreams.add(stream);
    return stream;
  }

  /** Closes all given streams, attaching any close failure to {@code cause} as suppressed. */
  private static void closeAfterFailure(List<FileInputStream> streams, Exception cause) {
    for (FileInputStream stream : streams) {
      try {
        stream.close();
      } catch (IOException closeFailure) {
        cause.addSuppressed(closeFailure);
      }
    }
  }

  private MixingSentenceSource(List<SentenceSource> sources, Language language) {
    super(language);
    this.sources = sources;
  }

  /**
   * Returns the live internal map of how many sentences have been returned
   * per source so far (not a copy).
   */
  Map<String, Integer> getSourceDistribution() {
    return sourceDistribution;
  }
  
  /** Returns {@code true} if at least one of the remaining sources has another sentence. */
  @Override
  public boolean hasNext() {
    for (SentenceSource source : sources) {
      if (source.hasNext()) {
        return true;
      }
    }
    return false;
  }

  /**
   * Returns the next sentence from the source whose turn it is, dropping
   * exhausted sources from the rotation on the way.
   *
   * @throws NoSuchElementException if no source has a sentence left
   */
  @Override
  public Sentence next() {
    // Guard first: with no sources left the modulo below would divide by zero.
    if (sources.isEmpty()) {
      throw new NoSuchElementException();
    }
    SentenceSource sentenceSource = currentSource();
    while (!sentenceSource.hasNext()) {
      sources.remove(sentenceSource);
      if (sources.isEmpty()) {
        throw new NoSuchElementException();
      }
      count++;
      sentenceSource = currentSource();
    }
    count++;
    Sentence next = sentenceSource.next();
    updateDistributionMap(next);
    return next;
  }

  /** Returns the source at the current round-robin position; requires a non-empty list. */
  private SentenceSource currentSource() {
    return sources.get((int) (count % sources.size()));
  }

  /** Increments the counter kept for the source of the given sentence. */
  private void updateDistributionMap(Sentence next) {
    Integer prevCount = sourceDistribution.get(next.getSource());
    if (prevCount != null) {
      sourceDistribution.put(next.getSource(), prevCount + 1);
    } else {
      sourceDistribution.put(next.getSource(), 1);
    }
  }

  /**
   * Returns the string representations of the sources still in rotation,
   * joined by {@code ", "}. Sources already removed as exhausted are not included.
   */
  @Override
  public String getSource() {
    return StringUtils.join(sources, ", ");
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy