All Downloads are FREE. Search and download functionalities are using the official Maven repository.

main.java.cc.twittertools.search.TrecTopicSet Maven / Gradle / Ivy

The newest version!
package cc.twittertools.search;

import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.google.common.base.Charsets;
import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.io.Files;

/**
 * An iterable collection of {@link TrecTopic}s parsed from a topics file.
 *
 * <p>Instances are created only through {@link #fromFile(File)}; topics are iterated in the
 * order in which they appear in the file.
 *
 * <p>NOTE(review): the tag names used in the patterns below ({@code <top>}, {@code <num>},
 * {@code <title>}, {@code <query>}, {@code <querytweettime>}) follow the TREC Microblog topic
 * file format — confirm against the topic files actually in use.
 */
public class TrecTopicSet implements Iterable<TrecTopic> {
  /** One complete topic entry; DOTALL because an entry spans several lines. */
  private static final Pattern TOP_PATTERN = Pattern.compile("<top(.*?)</top>", Pattern.DOTALL);

  /** Topic number line; group 1 is the identifier, e.g. {@code MB051}. */
  private static final Pattern NUM_PATTERN =
      Pattern.compile("<num> Number: (MB\\d+) </num>", Pattern.DOTALL);

  // TREC 2011 topics use the <title> tag
  private static final Pattern TITLE_PATTERN =
      Pattern.compile("<title>\\s*(.*?)\\s*</title>", Pattern.DOTALL);
  // TREC 2012 topics use the <query> tag
  private static final Pattern TITLE_PATTERN2 =
      Pattern.compile("<query>\\s*(.*?)\\s*</query>", Pattern.DOTALL);

  /** Query tweet time; group 1 is the numeric value parsed as a {@code long}. */
  private static final Pattern TWEETTIME_PATTERN =
      Pattern.compile("<querytweettime>\\s*(\\d+)\\s*</querytweettime>", Pattern.DOTALL);

  /** Identifiers made of "MB0" followed by exactly two digits; compiled once, not per topic. */
  private static final Pattern ZERO_PADDED_ID_PATTERN = Pattern.compile("MB0\\d\\d");

  private final List<TrecTopic> queries = Lists.newArrayList();

  private TrecTopicSet() {}

  private void add(TrecTopic q) {
    queries.add(q);
  }

  /** Returns an iterator over the topics, in the order they were parsed. */
  @Override
  public Iterator<TrecTopic> iterator() {
    return queries.iterator();
  }

  /**
   * Reads a topics file (as UTF-8) and parses every topic entry found in it.
   *
   * @param f the topics file; must be non-null and must exist
   * @return the parsed topics; empty if the file contains no topic entries
   * @throws NullPointerException if {@code f} is null
   * @throws IllegalArgumentException if {@code f} does not exist
   * @throws IOException if the file cannot be read, or if a topic entry lacks a number, a
   *     title/query, or a query tweet time
   */
  public static TrecTopicSet fromFile(File f) throws IOException {
    Preconditions.checkNotNull(f);
    Preconditions.checkArgument(f.exists());

    String s = Joiner.on("\n").join(Files.readLines(f, Charsets.UTF_8));
    TrecTopicSet queries = new TrecTopicSet();

    Matcher matcher = TOP_PATTERN.matcher(s);
    while (matcher.find()) {
      String top = matcher.group(0);

      Matcher m = NUM_PATTERN.matcher(top);
      if (!m.find()) {
        throw new IOException("Error parsing " + f);
      }
      String id = m.group(1);
      // Topics from 2012 are inconsistently numbered,
      // e.g., MB051 should match the qrels, which has MB51.
      // NOTE(review): this rule also rewrites MB001 to MB01 — confirm that is intended.
      if (ZERO_PADDED_ID_PATTERN.matcher(id).matches()) {
        id = id.replace("MB0", "MB");
      }

      // Try the 2011-style tag first, then fall back to the 2012-style tag.
      m = TITLE_PATTERN.matcher(top);
      if (!m.find()) {
        m = TITLE_PATTERN2.matcher(top);
        if (!m.find()) {
          throw new IOException("Error parsing " + f);
        }
      }
      String text = m.group(1);

      m = TWEETTIME_PATTERN.matcher(top);
      if (!m.find()) {
        throw new IOException("Error parsing " + f);
      }
      long time = Long.parseLong(m.group(1));
      queries.add(new TrecTopic(id, text, time));
    }
    return queries;
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy