io.anserini.collection.WashingtonPostCollection Maven / Gradle / Ivy
/*
* Anserini: A Lucene toolkit for reproducible information retrieval research
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.anserini.collection;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonGetter;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonSetter;
import com.fasterxml.jackson.databind.DeserializationContext;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.JsonDeserializer;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.datatype.jdk8.Jdk8Module;
import org.jsoup.Jsoup;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Optional;
/**
* An instance of the TREC Washington Post Corpus.
* This class works for both v2 and v3 of the Washington Post corpus.
*
* Collection v2 contains 608,180 news articles and blog posts from January 2012 through August 2017,
* stored in JSON format. The collection is 1.5GB compressed, 5.9GB uncompressed.
*
* Collection v3 contains an additional 154,418 new documents from 2018 and 2019 and removes (near)
* duplicates, resulting in a total of 671,947 news articles and blog posts from January 2012
* through December 2019. The collection is 1.8GB compressed, 6.8GB uncompressed.
*/
public class WashingtonPostCollection extends DocumentCollection {
/**
 * Creates a collection for the given path and registers {@code ".txt"} and {@code ".jl"}
 * as the allowed file suffixes.
 *
 * @param path location of the collection
 */
public WashingtonPostCollection(Path path) {
  this.allowedFileSuffix = new HashSet<>(Arrays.asList(".txt", ".jl"));
  this.path = path;
}
/**
 * Creates a collection without assigning a path or allowed file suffixes in this class.
 */
public WashingtonPostCollection() {}
/**
 * Creates a segment that reads Washington Post documents from the file at the given path.
 *
 * @param p path of the file to read
 * @return a new {@link Segment} for {@code p}
 * @throws IOException if the segment cannot be opened
 */
@Override
public FileSegment<WashingtonPostCollection.Document> createFileSegment(Path p) throws IOException {
  return new Segment(p);
}
/**
 * Creates a segment that reads Washington Post documents from an already opened reader.
 *
 * @param bufferedReader reader supplying the records
 * @return a new {@link Segment} backed by {@code bufferedReader}
 * @throws IOException if the segment cannot be created
 */
@Override
public FileSegment<WashingtonPostCollection.Document> createFileSegment(BufferedReader bufferedReader)
    throws IOException {
  return new Segment(bufferedReader);
}
/**
 * A file containing multiple documents from the TREC Washington Post Corpus.
 * The corpus is distributed as a single file; each line read from it is parsed as one JSON record.
 */
public static class Segment extends FileSegment<WashingtonPostCollection.Document> {
  /**
   * Shared JSON mapper, configured once instead of once per record. A fully configured
   * {@code ObjectMapper} is documented by Jackson as safe to share between threads.
   */
  private static final ObjectMapper MAPPER = new ObjectMapper()
      .disable(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES) // Ignore unrecognized properties
      .registerModule(new Jdk8Module()); // Deserialize Java 8 Optional: http://www.baeldung.com/jackson-optional

  private String fileName;

  /**
   * Opens the file at {@code path} for reading as UTF-8 text.
   *
   * @param path file to read
   * @throws IOException if the file cannot be opened
   */
  public Segment(Path path) throws IOException {
    super(path);
    this.fileName = path.toString();
    // Charset constant instead of a charset name: no lookup by string at runtime.
    this.bufferedReader = new BufferedReader(new InputStreamReader(
        new FileInputStream(this.fileName), java.nio.charset.StandardCharsets.UTF_8));
  }

  /**
   * Creates a segment on top of an already opened reader.
   *
   * @param bufferedReader reader supplying one record per line
   * @throws IOException if the superclass constructor fails
   */
  public Segment(BufferedReader bufferedReader) throws IOException {
    super(bufferedReader);
  }

  /**
   * Reads the next line and parses it into {@code bufferedRecord}.
   *
   * @throws IOException if reading the line fails
   * @throws NoSuchElementException if the reader has no more lines
   */
  @Override
  public void readNext() throws IOException {
    String nextRecord = bufferedReader.readLine();
    if (nextRecord == null) {
      throw new NoSuchElementException();
    }
    parseRecord(nextRecord);
  }

  /**
   * Parses one JSON line and stores the resulting document in {@code bufferedRecord}.
   *
   * @param record raw JSON text of a single document
   * @throws RuntimeException wrapping the {@link IOException} if the JSON cannot be parsed
   */
  private void parseRecord(String record) {
    Document.WashingtonPostObject wapoObj;
    try {
      wapoObj = MAPPER.readValue(record, Document.WashingtonPostObject.class);
    } catch (IOException e) {
      // In the current dataset every record has a unique id and a published date,
      // so a parse failure is unexpected; fail loudly in case future data
      // introduces malformed records.
      throw new RuntimeException(e);
    }
    // Build the document completely before publishing it as the buffered record.
    WashingtonPostCollection.Document doc = new WashingtonPostCollection.Document();
    doc.id = wapoObj.getId();
    doc.publishDate = wapoObj.getPublishedDate();
    doc.title = wapoObj.getTitle();
    doc.articleUrl = wapoObj.getArticleUrl();
    doc.author = wapoObj.getAuthor();
    doc.obj = wapoObj;
    doc.raw = record;
    bufferedRecord = doc;
  }
}
/**
* A document from the TREC Washington Post Corpus.
*/
public static class Document implements SourceDocument {
/** Content item types whose text has its markup removed before being added to the contents. */
public static final List<String> CONTENT_TYPE_TAG = Arrays.asList("sanitized_html", "tweet");
// Fields filled in from the parsed JSON object when a record is read
protected String id;
protected Optional<String> articleUrl;
protected Optional<String> author;
protected long publishDate;
protected Optional<String> title;
protected String raw;
protected WashingtonPostObject obj;
// Assigned as a side effect of contents(); null until then
protected String fullCaption = null;
protected String kicker = null;
/**
 * Parses the given string with Jsoup and returns the text of the parsed document.
 *
 * @param content markup to convert
 * @return the text produced by Jsoup for {@code content}
 */
private String removeTags(String content) {
  final String plainText = Jsoup.parse(content).text();
  return plainText;
}
/**
 * Returns the identifier stored for this document.
 */
@Override
public String id() {
  return this.id;
}
/**
 * Assembles the text of this document: the title (if present) followed by one line per
 * usable content item.
 *
 * <p>For each non-null content item that has both a type and a content value:
 * items whose type is listed in {@link #CONTENT_TYPE_TAG} are appended with markup removed;
 * a "kicker" item is appended as-is and remembered in {@code kicker}; a "date" item is parsed
 * into {@code publishDate} while that field is still 0. Independently, an item's full caption
 * (if present) is remembered in {@code fullCaption} and appended with markup removed.
 *
 * @return the assembled text, each piece terminated by a newline
 */
@Override
public String contents() {
  final StringBuilder text = new StringBuilder();
  if (getTitle().isPresent()) {
    text.append(getTitle().get()).append("\n");
  }
  getObj().getContents().ifPresent(items -> {
    for (WashingtonPostObject.Content item : items) {
      if (item == null) {
        continue;
      }
      if (item.getType().isPresent() && item.getContent().isPresent()) {
        final String type = item.getType().get();
        final String body = item.getContent().get();
        if (CONTENT_TYPE_TAG.contains(type)) {
          text.append(removeTags(body)).append("\n");
        } else if (type.compareToIgnoreCase("kicker") == 0) {
          kicker = body;
          text.append(body).append("\n");
        } else if (type.compareToIgnoreCase("date") == 0 && this.publishDate == 0) {
          this.publishDate = Long.parseLong(body);
        }
      }
      if (item.getFullCaption().isPresent()) {
        fullCaption = item.getFullCaption().get();
        text.append(removeTags(fullCaption)).append("\n");
      }
    }
  });
  return text.toString();
}
/**
 * Returns the raw text stored for this document.
 */
@Override
public String raw() {
  return this.raw;
}
/**
 * Always reports this document as indexable.
 *
 * @return {@code true}
 */
@Override
public boolean indexable() {
  final boolean alwaysIndexable = true;
  return alwaysIndexable;
}
/**
 * Returns the stored full caption; {@code null} unless {@link #contents()} has recorded one.
 */
public String getFullCaption() {
  return this.fullCaption;
}
/**
 * Returns the stored kicker; {@code null} unless {@link #contents()} has recorded one.
 */
public String getKicker() {
  return this.kicker;
}
/**
 * Returns the article URL of this document, if one was provided.
 *
 * @return the stored {@code articleUrl} value
 */
public Optional<String> getArticleUrl() {
  return articleUrl;
}
/**
 * Returns the author of this document, if one was provided.
 *
 * @return the stored {@code author} value
 */
public Optional<String> getAuthor() {
  return author;
}
/**
 * Returns the stored publish date value.
 */
public long getPublishDate() {
  return this.publishDate;
}
/**
 * Returns the title of this document, if one was provided.
 *
 * @return the stored {@code title} value
 */
public Optional<String> getTitle() {
  return title;
}
/**
 * Returns the parsed JSON object this document was built from.
 */
public WashingtonPostObject getObj() {
  return this.obj;
}
/**
* Used internally by Jackson for JSON parsing.
*/
public static class WashingtonPostObject {
// Required fields
protected String id;
protected Optional articleUrl;
protected Optional author;
protected Optional title;
// Optional fields
protected long publishedDate;
protected Optional> contents;
/**
* Used internally by Jackson for JSON parsing.
*/
@SuppressWarnings("unchecked")
public static class ContentJsonDeserializer extends JsonDeserializer {
/**
 * Reads one content item as a generic map and copies its type, content and full caption
 * into a new {@link Content}.
 *
 * @param jsonParser parser positioned at the content item
 * @param context deserialization context (unused here)
 * @return the populated content item
 * @throws IOException if the item cannot be read
 */
@Override
public Content deserialize(com.fasterxml.jackson.core.JsonParser jsonParser,
    DeserializationContext context) throws IOException {
  // readValueAs(Map.class) yields an untyped map; JSON object keys are strings.
  Map<String, Object> contentMap = jsonParser.readValueAs(Map.class);
  Content content = new Content();
  content.setType(getType(contentMap));
  content.setContent(getContent(contentMap));
  content.setFullCaption(getFullCaption(contentMap));
  return content;
}
/**
 * Extracts the {@code "type"} entry of a content item as a string.
 *
 * @param map the content item read as a map
 * @return the string form of the {@code "type"} value, or empty if the entry is absent or null
 */
private Optional<String> getType(Map<String, Object> map) {
  Object type = map.get("type");
  return type == null ? Optional.empty() : Optional.of(type.toString());
}
private Optional getContent(Map map) {
Object contentObj = map.get("content");
if (contentObj == null) {
return Optional.empty();
}
StringBuilder contentStringBuilder = new StringBuilder();
if (contentObj instanceof String) {
contentStringBuilder.append(contentObj);
} else if (contentObj instanceof Long) {
contentStringBuilder.append(String.valueOf(contentObj));
} else if (contentObj instanceof List) {
for (Object content: (List
© 2015 - 2025 Weber Informatics LLC | Privacy Policy