All downloads are free. Search and download functionality uses the official Maven repository.

org.terrier.applications.batchquerying.TRECQuery Maven / Gradle / Ivy

The newest version!
/*
 * Terrier - Terabyte Retriever 
 * Webpage: http://terrier.org 
 * Contact: terrier{a.}dcs.gla.ac.uk
 * University of Glasgow - School of Computing Science
 * http://www.gla.ac.uk/
 * 
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 * the License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is TRECQuery.java.
 *
 * The Original Code is Copyright (C) 2004-2020 the University of Glasgow.
 * All Rights Reserved.
 *
 * Contributor(s):
 *   Ben He  (original author) 
 *   Craig Macdonald 
 */
package org.terrier.applications.batchquerying;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.StringTokenizer;
import java.util.Vector;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.terrier.applications.batchquerying.QuerySource;
import org.terrier.indexing.TRECFullTokenizer;
import org.terrier.utility.ApplicationSetup;
import org.terrier.utility.ArrayUtils;
import org.terrier.utility.Files;
import org.terrier.utility.TagSet;
/**
 * This class is used for reading the queries 
 * from TREC topic files.
 * 

Properties:

*
    *
  • trecquery.ignore.desc.narr.name.tokens - should the token DESCRIPTION and NARRATIVE in the desc and narr fields be ignored? Defaluts to true
  • *
  • tokeniser - name of the Tokeniser class to use to tokenise topics. Defaults to EnglishTokeniser.
  • *
  • trec.encoding - use to set the encoding of TREC topic files. Defaults to the systems default encoding.
  • *
* @author Ben He & Craig Macdonald */ public class TRECQuery implements QuerySource { /** The logger used for this class */ protected static final Logger logger = LoggerFactory.getLogger(TRECQuery.class); /** Value of trecquery.ignore.desc.narr.name.tokens - should the token DESCRIPTION and NARRATIVE in the desc and narr fields be ignored? Defaluts to true? */ protected static final boolean IGNORE_DESC_NARR_NAME_TOKENS = Boolean.parseBoolean(ApplicationSetup.getProperty("trecquery.ignore.desc.narr.name.tokens","true")); /** Encoding to be used to open all files. */ protected String desiredEncoding = ApplicationSetup.getProperty("trec.encoding", null); /** The topic files used in this object */ protected String[] topicFiles; /** The queries in the topic files.*/ protected String[] queries = null; /** The query identifiers in the topic files.*/ protected String[] query_ids = null; /** The index of the queries.*/ protected int index; protected TagSet tags; /** * Extracts and stores all the queries from query files. * @param queryfilenames String the name of files containing topics. * @param vecStringQueries Vector a vector containing the * queries as strings. * @param vecStringIds Vector a vector containing the query * identifiers as strings. * @return boolean true if some queries were successfully extracted. */ public boolean extractQuery(String[] queryfilenames, TagSet t, Vector vecStringQueries, Vector vecStringIds) { boolean rtn = false; for (int i=0;i vecStringQueries, Vector vecStringIds) { boolean gotSome = false; try { BufferedReader br; if (! Files.exists(queryfilename) || ! 
Files.canRead(queryfilename)) { logger.error("The topics file " + queryfilename + " does not exist, or it cannot be read."); return false; } else { br = Files.openFileReader(queryfilename,desiredEncoding); TRECFullTokenizer queryTokenizer = new TRECFullTokenizer( t, new TagSet(TagSet.EMPTY_TAGS), br); queryTokenizer.setIgnoreMissingClosingTags(true); while (!queryTokenizer.isEndOfFile()) { String docnoToken = null; StringBuilder query = new StringBuilder(); boolean seenDescriptionToken = ! IGNORE_DESC_NARR_NAME_TOKENS; boolean seenNarrativeToken = ! IGNORE_DESC_NARR_NAME_TOKENS; while (!queryTokenizer.isEndOfDocument()) { String token = queryTokenizer.nextToken(); if (token == null || token.length() == 0 || queryTokenizer.inTagToSkip()) continue; if (queryTokenizer.inDocnoTag()) { //The tokenizer is constructed from the trimmed version of the contents //of the query number tag, ignoring the token Number: StringTokenizer docnoTokens = new StringTokenizer(token.trim(), " "); while (docnoTokens.hasMoreTokens()) { String tok = docnoTokens.nextToken().trim(); if (! tok.equalsIgnoreCase("number")) docnoToken = tok; } } else if (queryTokenizer.inTagToProcess()) { // Removed the code that checks if "description" and // "narrative" appear in "desc" and "narr", respective. // THIS WILL HURT THE RETRIEVAL PERFORMANCE. Therefore, // it is recommended to add these words in the stopword // list. 
if (!seenDescriptionToken && queryTokenizer .currentTag() .equalsIgnoreCase("DESC") && token.equalsIgnoreCase("DESCRIPTION")) continue; if (!seenNarrativeToken && queryTokenizer .currentTag() .equalsIgnoreCase("NARR") && token.equalsIgnoreCase("NARRATIVE")) continue; query.append(token); query.append(' '); } } queryTokenizer.nextDocument(); if (query.length() == 0) continue; vecStringQueries.add(query.toString().trim()); if (docnoToken == null) throw new IOException("No id tag found for this query"); vecStringIds.add(docnoToken); gotSome = true; } //after processing each query file, close the BufferedReader br.close(); } }catch (IOException ioe) { logger.error("Input/Output exception while extracting queries from the topic file named "+queryfilename, ioe); } return gotSome; } public TRECQuery(String[] queryfilenames, String docTag, String idTag, String[] whitelist, String[] blacklist) { TagSet.TagSetFactory fact = TagSet.factory().setDocTag(docTag).setIdTag(idTag); if (whitelist != null) fact.setWhitelist(whitelist); if (blacklist != null) fact.setBlacklist(blacklist); this.tags = fact.build(); this.topicFiles = queryfilenames; } /** * Constructs an instance of TRECQuery, * that reads and stores all the queries from * the files defined in the trec.topics property. */ public TRECQuery() { this(ArrayUtils.parseCommaDelimitedString(ApplicationSetup.getProperty("trec.topics", ""))); } /** * Constructs an instance of TRECQuery that * reads and stores all the queries from a * file with the specified filename. * @param queryfilename String the name of the file containing * all the queries. */ public TRECQuery(String queryfilename){ this(new String[]{queryfilename}); } /** * Constructs an instance of TRECQuery that * reads and stores all the queries from * files with the specified filename. * @param queryfilenames String[] the name of the files containing * all the queries. 
*/ public TRECQuery(String[] queryfilenames){ this.topicFiles = queryfilenames; this.tags = new TagSet(TagSet.TREC_QUERY_TAGS); checkEncoding(); } protected void checkEncoding() { if (desiredEncoding == null) { String defaultEncoding = Charset.defaultCharset().name(); if (! defaultEncoding.equals("UTF-8")) { logger.warn("trec.encoding is not set; resorting to platform default ("+defaultEncoding+"). Retrieval may be platform dependent. Recommend trec.encoding=UTF-8"); } desiredEncoding = defaultEncoding; } } protected void performExtraction(){ Vector vecStringQueries = new Vector(); Vector vecStringQueryIDs = new Vector(); checkEncoding(); if (! this.extractQuery(this.topicFiles, this.tags, vecStringQueries, vecStringQueryIDs)) { logger.error("Topic files were specified, but non could be parsed correctly to obtain any topics." + " Check you have the correct topic files specified, and that tags are correct."); return; } this.queries = vecStringQueries.toArray(new String[0]); this.query_ids = vecStringQueryIDs.toArray(new String[0]); this.index = 0; } /** * Returns the index of the last obtained query. * @return int the index of the last obtained query. */ public int getIndexOfCurrentQuery() { return index - 1; } /** * Returns the number of the queries read from the * processed topic files. * @return int the number of topics contained in the * processed topic files. */ public int getNumberOfQueries() { return queries.length; } /** Returns the filenames of the topic files from which the queries were extracted */ public String[] getInfo() { return this.topicFiles; } /** * Return the query for the given query number. * @return String the string representing the query. * @param queryNo String The number of a query. 
*/ public String getQuery(String queryNo) { for (int i = 0; i < query_ids.length; i++) if (query_ids[i].equals(queryNo)) return queries[i]; return null; } /** * {@inheritDoc} */ public boolean hasNext() { if (queries == null) performExtraction(); if (index == queries.length) return false; return true; } /** * {@inheritDoc} */ public String next() { if (queries == null) performExtraction(); if (index == queries.length) return null; return queries[index++]; } /** {@inheritDoc} */ public String getQueryId() { return query_ids[index == 0 ? 0 : index-1]; } /** Returns the query ids * @return String array containing the query ids. * @since 2.2 */ public String[] getQueryIds() { return query_ids; } /** * Returns the queries in an array of strings * @return String[] an array containing the strings that * represent the queries. */ public String[] toArray() { return (String[]) queries.clone(); } /** {@inheritDoc} */ public void reset() { this.index = 0; } /** * {@inheritDoc} */ public void remove() { throw new UnsupportedOperationException(); } /** * main * @param args */ public static void main(String[] args) { TRECQuery source = new TRECQuery(args[0]); while(source.hasNext()) { String query = source.next(); String id = source.getQueryId(); System.out.println(id + ": " + query); } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy