 
                        
        
                        
        io.anserini.collection.ClueWeb09Collection Maven / Gradle / Ivy
/*
 * Anserini: A Lucene toolkit for reproducible information retrieval research
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.anserini.collection;
import org.apache.commons.io.input.ReaderInputStream;
import org.apache.logging.log4j.LogManager;

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.zip.GZIPInputStream;
/**
 * An instance of the ClueWeb09 collection.
 * This can be used to read the complete ClueWeb09 collection or the smaller ClueWeb09b subset.
 */
public class ClueWeb09Collection extends DocumentCollection {
  public ClueWeb09Collection(Path path) {
    this.path = path;
    this.allowedFileSuffix = Set.of(".warc.gz");
  }
  public ClueWeb09Collection() {
  }
  @Override
  public FileSegment createFileSegment(Path p) throws IOException {
    return new Segment(p);
  }
  @Override
  public FileSegment createFileSegment(BufferedReader bufferedReader) throws IOException {
    return new Segment(bufferedReader);
  }
  /**
   * An individual WARC in the ClueWeb09 collection.
   */
  public static class Segment extends FileSegment {
    protected DataInputStream stream;
    private String rawContent = null; // raw content from buffered string
    public Segment(Path path) throws IOException {
      super(path);
      this.stream = new DataInputStream(new GZIPInputStream(Files.newInputStream(path, StandardOpenOption.READ)));
    }
    public Segment(BufferedReader bufferedReader) throws IOException {
      super(bufferedReader);
      rawContent = "Content-Length:\n" + bufferedReader.lines().collect(Collectors.joining("\n"));
    }
    @Override
    public void readNext() throws IOException, NoSuchElementException {
      if (rawContent != null) {
        bufferedRecord = Document.readNextWarcRecord(rawContent);
        rawContent = null;
      } else {
        bufferedRecord = Document.readNextWarcRecord(stream);
      }
    }
    @Override
    public void close() {
      try {
        if (stream != null) {
          stream.close();
        }
        super.close();
      } catch (IOException e) {
        // There's really nothing to be done, so just silently eat the exception.
      }
    }
  }
  /**
   * A document from the ClueWeb09 collection.
   * This class derives from tools provided by CMU for reading the ClueWeb09 collection.
   */
  public static class Document extends WarcBaseDocument {
    static {
      LOG = LogManager.getLogger(Document.class);
      WARC_VERSION = "WARC/0.18";
    }
    /**
     * Reads in a WARC record from a data input stream.
     *
     * @param in      the input stream
     * @return a WARC record (or null if EOF)
     * @throws IOException if error encountered reading from stream
     */
    public static Document readNextWarcRecord(DataInputStream in)
        throws IOException {
      StringBuilder recordHeader = new StringBuilder();
      byte[] recordContent = readNextRecord(in, recordHeader, "Content-Length");
      Document retRecord = new Document();
      retRecord.setHeader(recordHeader.toString());
      retRecord.setContent(recordContent);
      return retRecord;
    }
    /**
     * Reads in a WARC record from a data input stream.
     *
     * @param rawContent the input raw string
     * @return a WARC record (or null if EOF)
     */
    public static Document readNextWarcRecord(String rawContent) {
      StringBuilder recordHeader = new StringBuilder();
      byte[] recordContent = rawContent.getBytes();
      Document retRecord = new Document();
      retRecord.setHeader("");
      retRecord.setContent(recordContent);
      return retRecord;
    }
    @Override
    public String getContent() {
      String str = getContentUTF8();
      int i = str.indexOf("Content-Length:");
      int j = str.indexOf("\n", i);
      return str.substring(j + 1);
    }
    @Override
    public String getDocid() {
      return getHeaderMetadataItem("WARC-TREC-ID");
    }
  }
}
    © 2015 - 2025 Weber Informatics LLC | Privacy Policy