org.apache.lucene.benchmark.utils.ExtractReuters Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of lucene-benchmark Show documentation
Apache Lucene (module: benchmark)
There is a newer version: 9.11.1
package org.apache.lucene.benchmark.utils;
/**
 * Copyright 2005 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


import java.io.BufferedReader;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.util.IOUtils;


/**
 * Split the Reuters SGML documents into Simple Text files containing: Title, Date, Dateline, Body
 */
public class ExtractReuters {
  /** Directory that {@code extract()} scans for files whose names end in ".sgm". */
  private File reutersDir;
  /**
   * Directory whose existing entries are deleted by the constructor.
   * NOTE(review): presumably also the destination for the extracted text files; the
   * code that writes output is not visible in this copy of the file - confirm.
   */
  private File outputDir;
  /** Value of the "line.separator" system property, read once when the class is initialized. */
  private static final String LINE_SEPARATOR = System.getProperty("line.separator");

  public ExtractReuters(File reutersDir, File outputDir) {
    this.reutersDir = reutersDir;
    this.outputDir = outputDir;
    System.out.println("Deleting all files in " + outputDir);
    for (File f : outputDir.listFiles()) {
      f.delete();
    }
  }

  /**
   * Runs {@link #extractFile(File)} on every file in the Reuters directory whose name
   * ends in ".sgm". Prints a message to standard error when the directory cannot be
   * listed or contains no such file.
   */
  public void extract() {
    FileFilter sgmOnly = new FileFilter() {
      @Override
      public boolean accept(File candidate) {
        return candidate.getName().endsWith(".sgm");
      }
    };
    File[] inputs = reutersDir.listFiles(sgmOnly);

    // Nothing to process: listing failed (null) or no matching file was found.
    if (inputs == null || inputs.length == 0) {
      System.err.println("No .sgm files in " + reutersDir);
      return;
    }

    for (int i = 0; i < inputs.length; i++) {
      extractFile(inputs[i]);
    }
  }

  Pattern EXTRACTION_PATTERN = Pattern
      .compile("(.*?)|(.*?)|(.*?)");

  private static String[] META_CHARS = { "&", "<", ">", "\"", "'" };

  private static String[] META_CHARS_SERIALIZATIONS = { "&", "<",
      ">", """, "'" };

  /**
   * Override if you wish to change what is extracted.
   *
   * The visible part of this method opens {@code sgmFile} through a reader created with
   * {@code IOUtils.CHARSET_UTF_8} and reads it line by line, with two string buffers and
   * a document counter prepared for the loop body.
   *
   * NOTE(review): this copy of the method is truncated inside the read loop (see the
   * marker below); the per-document extraction, output writing, exception handling and
   * reader cleanup are not visible here and must be restored from the upstream source.
   *
   * @param sgmFile the SGML input file to read
   */
  protected void extractFile(File sgmFile) {
    try {
      BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(sgmFile), IOUtils.CHARSET_UTF_8));

      // Both buffers start empty with an initial capacity of 1024 characters.
      StringBuilder buffer = new StringBuilder(1024);
      StringBuilder outBuffer = new StringBuilder(1024);

      String line = null;
      // Starts at 0; presumably counts documents within this file - the code that
      // updates it is missing from this copy, confirm.
      int docNumber = 0;
      while ((line = reader.readLine()) != null) {
        // when we see a closing reuters tag, flush the file

        // NOTE(review): the source is corrupted from here on. The statement below fuses
        // the start of a condition with the tail of an unrelated string literal; all
        // code in between was lost and the file does not compile as it stands.
        if (line.indexOf(" org.apache.lucene.benchmark.utils.ExtractReuters  ");
  }
  
}