org.apache.lucene.benchmark.quality.trec.TrecTopicsReader Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-benchmark Show documentation
Apache Lucene (module: benchmark)
There is a newer version: 9.11.1
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.benchmark.quality.trec;

import java.io.BufferedReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;

import org.apache.lucene.benchmark.quality.QualityQuery;

/**
 * Read TREC topics.
 * <p>
 * Expects this topic format -
 * <pre>
 *   &lt;top&gt;
 *   &lt;num&gt; Number: nnn
 *
 *   &lt;title&gt; title of the topic
 *
 *   &lt;desc&gt; Description:
 *   description of the topic
 *
 *   &lt;narr&gt; Narrative:
 *   "story" composed by assessors.
 *
 *   &lt;/top&gt;
 * </pre>
 * Comment lines starting with '#' are ignored.
 */
public class TrecTopicsReader {

  /** Platform line separator, used to join the collected description lines. */
  private static final String NEWLINE = System.getProperty("line.separator");

  /**
   *  Constructor for Trec's TopicsReader
   */
  public TrecTopicsReader() {
    super();
  }

  /**
   * Read quality queries from trec format topics file.
   * <p>
   * Every topic contributes a query whose ID is taken from the
   * <code>&lt;num&gt;</code> line and whose fields are <code>"title"</code>
   * (text of the <code>&lt;title&gt;</code> line) and <code>"description"</code>
   * (the lines between the <code>&lt;desc&gt;</code> and <code>&lt;narr&gt;</code>
   * lines). The narrative is skipped. The reader is always closed before this
   * method returns, also when reading fails.
   *
   * @param reader where queries are read from.
   * @return the result quality queries, sorted with {@link Arrays#sort(Object[])}.
   * @throws IOException if cannot read the queries, or if the input ends in the
   *         middle of a topic (before its <code>&lt;narr&gt;</code> line).
   */
  public QualityQuery[] readQueries(BufferedReader reader) throws IOException {
    ArrayList res = new ArrayList();
    try {
      // Every topic starts at a <top> line; anything in front of it (blank
      // lines, comments, leftovers of the previous topic) is skipped.
      while (read(reader, "<top>", null, false, false) != null) {
        HashMap fields = new HashMap();
        // id: the text after "Number:" (or after the tag when there is no colon)
        StringBuffer sb = require(read(reader, "<num>", null, true, false), "<num>");
        int k = sb.indexOf(":");
        if (k < 0) {
          k = sb.indexOf(">");
        }
        String id = sb.substring(k + 1).trim();
        // title: the text following the <title> tag on the same line
        sb = require(read(reader, "<title>", null, true, false), "<title>");
        k = sb.indexOf(">");
        String title = sb.substring(k + 1).trim();
        // description: everything between the <desc> line and the <narr> line
        require(read(reader, "<desc>", null, false, false), "<desc>");
        sb = require(read(reader, "<narr>", null, false, true), "<narr>");
        String description = sb.toString().trim();
        // we got a topic!
        fields.put("title", title);
        fields.put("description", description);
        res.add(new QualityQuery(id, fields));
        // skip narrative, get to end of doc; a missing </top> at end of input
        // is tolerated, the topic has already been recorded
        read(reader, "</top>", null, false, false);
      }
    } finally {
      reader.close();
    }
    // sort result array (natural ordering of QualityQuery, expected to be by ID)
    QualityQuery[] qq = (QualityQuery[]) res.toArray(new QualityQuery[0]);
    Arrays.sort(qq);
    return qq;
  }

  // Turns a premature end of input (null from read) into a descriptive
  // IOException instead of letting the caller hit a NullPointerException.
  private static StringBuffer require(StringBuffer sb, String tag) throws IOException {
    if (sb == null) {
      throw new IOException("Unexpected end of TREC topics input: no " + tag + " line found");
    }
    return sb;
  }

  // Read until finding a line that starts with the specified prefix.
  // Lines starting with '#' are comments and are never matched nor collected.
  // Returns the buffer (the given one, or a new one if sb is null) holding the
  // collected lines joined by the platform line separator, or null if the
  // input ended before a matching line was found.
  //   collectMatchLine - also append the matching line itself
  //   collectAll       - append every non-matching line read on the way
  private StringBuffer read (BufferedReader reader, String prefix, StringBuffer sb, boolean collectMatchLine, boolean collectAll) throws IOException {
    StringBuffer out = (sb == null ? new StringBuffer() : sb);
    String sep = "";
    while (true) {
      String line = reader.readLine();
      if (line == null) {
        return null;
      }
      if (line.startsWith("#")) {
        continue;
      }
      if (line.startsWith(prefix)) {
        if (collectMatchLine) {
          out.append(sep).append(line);
        }
        return out;
      }
      if (collectAll) {
        out.append(sep).append(line);
        sep = NEWLINE;
      }
    }
  }
}
</code></pre>    <br/>
    <br/>
    <!--<div id="right-banner">-->
            <!--</div>-->
    <!--<div id="left-banner">-->
            <!--</div>-->
<div class='clear'></div>
</main>
</div>
<br/><br/>
    <div class="align-center">© 2015 - 2024 <a href="/legal-notice.php">Weber Informatics LLC</a> | <a href="/data-protection.php">Privacy Policy</a></div>
<br/><br/><br/><br/><br/><br/>
</body>
</html>