
// org.terrier.indexing.TwitterJSONCollection Maven / Gradle / Ivy
// The newest version!
/*
* Terrier - Terabyte Retriever
* Webpage: http://terrier.org/
* Contact: terrier{a.}dcs.gla.ac.uk
* University of Glasgow - School of Computing Science
* http://www.gla.ac.uk/
*
* The contents of this file are subject to the Mozilla Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
* the License for the specific language governing rights and limitations
* under the License.
*
* The Original Code is TwitterJSONCollection.java
*
* The Original Code is Copyright (C) 2004-2020 the University of Glasgow.
* All Rights Reserved.
*
* Contributor(s):
* Richard McCreadie (original contributor)
*/
package org.terrier.indexing;
import gnu.trove.TLongHashSet;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.terrier.utility.ApplicationSetup;
import org.terrier.utility.Files;
import com.google.gson.JsonObject;
import com.google.gson.JsonStreamParser;
/**
 * This class represents a collection of tweets stored in JSON format.
 * Like TRECCollection, it expects a collection specification listing all
 * of the files to be read. Files whose name ends in <code>bz2</code> are
 * decompressed with BZip2; every other file is opened through
 * {@link Files#openFileReader(String, String)}.
 * NOTE(review): the original documentation states that files are expected
 * to be gzip-compressed with one tweet per line - presumably
 * <code>Files.openFileReader</code> takes care of gzip; confirm.
 * The google.gson stream parser is used to read the tweet JSON, and each
 * tweet is exposed as a {@link FlatJSONDocument}.
 * <p>
 * Documents are de-duplicated on their numeric <code>docno</code> property:
 * {@link #getDocument()} returns <code>null</code> for a docno that has
 * already been returned.
 *
 * @author Richard McCreadie
 * @since 4.0
 *
 */
public class TwitterJSONCollection implements Collection {

    /** logger for this class */
    protected static final Logger logger = LoggerFactory.getLogger(TwitterJSONCollection.class);

    /** The list of files to process. */
    protected List<String> FilesToProcess = null;

    /** Set to true by {@link #openNextFile()} each time it moves on to another file; never reset by this class. */
    protected boolean SkipFile = false;

    /** The JSON stream containing the tweets of the current file, or null when no file is open. */
    protected JsonStreamParser JSONStream = null;

    /** The underlying file stream reading tweets from the current file, or null when no file is open. */
    protected BufferedReader currentTweetStream = null;

    /** The current document */
    protected Document currentDocument = null;

    /** The name of the current file */
    protected String currentFilename = null;

    /** The index in the FilesToProcess of the currently processed file.*/
    protected int FileNumber = -1;

    /** Have we reached the end of the collection yet? */
    protected boolean endOfCollection = false;

    /** The docnos of every document already handed out by {@link #getDocument()}; used to drop duplicates. */
    TLongHashSet alldocnos = new TLongHashSet();

    /**
     * Creates a collection from a collection specification file and opens
     * the first readable file listed in it.
     * @param CollectionSpecFile path of the collection specification file.
     */
    public TwitterJSONCollection(String CollectionSpecFile) {
        readCollectionSpec(CollectionSpecFile);
        openFirstFile();
    }

    /**
     * Creates an empty, uninitialised collection. Files can be added with
     * {@link #addFileToProcess(String)}; if none are added,
     * {@link #nextDocument()} falls back to {@link #init()}.
     */
    public TwitterJSONCollection() {}

    /**
     * Additional constructor required by TRECIndexing.
     * @param addressCollectionFilename path of the collection specification file.
     * @param ignored1 unused.
     * @param ignored2 unused.
     * @param ignored3 unused.
     */
    public TwitterJSONCollection(String addressCollectionFilename, String ignored1, String ignored2, String ignored3) {
        this(addressCollectionFilename);
    }

    /**
     * Creates a collection over an explicit list of files and opens the
     * first readable one.
     * @param files the filenames to read, in order.
     * @param ignored1 unused.
     * @param ignored2 unused.
     * @param ignored3 unused.
     */
    public TwitterJSONCollection(List<String> files, String ignored1, String ignored2, String ignored3) {
        FilesToProcess = files;
        openFirstFile();
    }

    /**
     * Initialises the collection from the collection specification named by
     * {@link ApplicationSetup#COLLECTION_SPEC} and opens the first readable file.
     */
    public void init() {
        readCollectionSpec(ApplicationSetup.COLLECTION_SPEC);
        openFirstFile();
    }

    /** Opens the first file of the collection, logging (rather than propagating) any IOException. */
    private void openFirstFile() {
        try {
            openNextFile();
        } catch (IOException ioe) {
            logger.error("IOException opening first file of collection - is the collection.spec correct?", ioe);
        }
    }

    /**
     * Opens the given file as the current tweet stream and attaches a JSON
     * stream parser to it.
     * @param file the file to open; names ending in <code>bz2</code> are
     * read through a BZip2 decompressor.
     * @throws IOException if the file cannot be opened.
     */
    protected void loadJSON(String file) throws IOException {
        if (file.endsWith("bz2")) {
            FileInputStream raw = new FileInputStream(file);
            boolean opened = false;
            try {
                currentTweetStream = new BufferedReader(
                        new InputStreamReader(new BZip2CompressorInputStream(raw), "UTF-8"));
                opened = true;
            } finally {
                // the decompressor reads the stream header eagerly; do not leak the
                // file handle if that (or the charset lookup) fails
                if (!opened) {
                    try {
                        raw.close();
                    } catch (IOException ignored) {
                        // keep the original failure
                    }
                }
            }
        } else {
            currentTweetStream = Files.openFileReader(file, "UTF-8");
        }
        JSONStream = new JsonStreamParser(currentTweetStream);
    }

    /**
     * Appends a file to the list of files to process.
     * @param JSONFile the filename to add.
     */
    public void addFileToProcess(String JSONFile) {
        if (FilesToProcess == null) {
            FilesToProcess = new ArrayList<String>();
        }
        FilesToProcess.add(JSONFile);
    }

    /**
     * Reads the collection specification file, replacing the list of files
     * to process. Blank lines and lines starting with <code>#</code> are
     * ignored. If the specification cannot be read, the error is logged and
     * the list contains whatever was read before the failure (possibly nothing).
     * @param CollectionSpecFilename path of the collection specification file.
     */
    protected void readCollectionSpec(String CollectionSpecFilename)
    {
        // assign before opening so that a failed open leaves an empty list, not null
        FilesToProcess = new ArrayList<String>();
        BufferedReader br2 = null;
        try {
            br2 = Files.openFileReader(CollectionSpecFilename);
            String filename;
            while ((filename = br2.readLine()) != null) {
                filename = filename.trim();
                if (!filename.startsWith("#") && !filename.equals("")) {
                    FilesToProcess.add(filename);
                }
            }
            logger.info("TwitterJSONCollection read collection specification ("+FilesToProcess.size()+" files)");
        } catch (IOException ioe) {
            logger.error("Input output exception while loading the collection.spec file. "
                + "("+CollectionSpecFilename+")", ioe);
        } finally {
            if (br2 != null) {
                try {
                    br2.close();
                } catch (IOException ioe) {
                    logger.warn("IOException while closing the collection.spec file. "
                        + "("+CollectionSpecFilename+")", ioe);
                }
            }
        }
    }

    /**
     * Opens the next file from the collection specification, skipping
     * (with a warning) files that do not exist or cannot be read.
     * @return boolean true if a file was opened successfully. If there
     * are no more files to open, it returns false and marks the end of
     * the collection.
     * @throws IOException if there is an exception while opening the
     * collection files.
     */
    public boolean openNextFile() throws IOException {
        if (FilesToProcess == null) {
            // nothing has been configured, so there is nothing to read
            endOfCollection = true;
            return false;
        }
        //try to close the currently open file
        if (currentTweetStream != null && FilesToProcess.size() > 0) {
            try {
                close();
            } catch (IOException ioe) {
                logger.warn("IOException while closing file being read", ioe);
            }
        }
        //keep trying files until one opens or the list is exhausted
        while (FileNumber < FilesToProcess.size() - 1) {
            SkipFile = true;
            FileNumber++;
            String filename = FilesToProcess.get(FileNumber);
            //check the filename is sane
            if (!Files.exists(filename)) {
                logger.warn("Could not open {} : File Not Found", filename);
                continue;
            }
            if (!Files.canRead(filename)) {
                logger.warn("Could not open {} : Cannot read", filename);
                continue;
            }
            //filename seems ok, open it
            loadJSON(filename); //throws an IOException, throw upwards
            logger.info("Processing {}", filename);
            currentFilename = filename;
            return true;
        }
        //last file of the collection has been read, EOC
        endOfCollection = true;
        return false;
    }

    /**
     * Closes the file currently being read, if any, and forgets the
     * associated JSON parser.
     * @throws IOException if closing the underlying reader fails.
     */
    @Override
    public void close() throws IOException {
        if (currentTweetStream == null) {
            return;
        }
        try {
            currentTweetStream.close();
        } finally {
            // never keep a parser around that is bound to a closed reader
            currentTweetStream = null;
            JSONStream = null;
        }
    }

    /**
     * Advances to the next tweet, moving on to the following file(s) when
     * the current one is exhausted. If the JSON parser fails on the current
     * file, the failure is logged and the remainder of that file is skipped.
     * @return true if a new current document is available, false if there
     * are no more documents or the next file could not be opened.
     */
    @Override
    public boolean nextDocument() {
        if (FilesToProcess == null) {
            init();
        }
        while (true) {
            boolean nextOK = false;
            if (JSONStream != null) {
                try {
                    nextOK = JSONStream.hasNext();
                } catch (Exception e1) {
                    logger.warn("Exception when checking if JSONStream has another document", e1);
                }
            }
            if (nextOK) {
                JsonObject tweet = null;
                try {
                    tweet = readTweet();
                } catch (RuntimeException e) {
                    // e.g. truncated or malformed JSON: treated like a hasNext() failure,
                    // i.e. give up on this file; moving to the next file guarantees progress
                    logger.warn("Exception when reading a tweet from " + currentFilename
                        + ", skipping the rest of the file", e);
                }
                if (tweet != null) {
                    currentDocument = new FlatJSONDocument(tweet);
                    return true;
                }
            }
            // the current file is exhausted (or unreadable): do not expose a stale document
            currentDocument = null;
            try {
                if (!openNextFile()) {
                    return false;
                }
            } catch (IOException e) {
                logger.error("IOException while opening the next file of the collection", e);
                return false;
            }
        }
    }

    /**
     * Reads the next element from the JSON stream as a JSON object.
     * @return the next tweet.
     */
    public JsonObject readTweet() {
        return JSONStream.next().getAsJsonObject();
    }

    /**
     * Returns the current document, unless it has no parseable numeric
     * <code>docno</code> property or a document with the same docno has
     * already been returned. Note that, because returned docnos are
     * remembered, calling this method twice for the same document yields
     * <code>null</code> the second time.
     * @return the current document, or null if it should be skipped.
     */
    @Override
    public Document getDocument() {
        if (currentDocument == null) {
            return null;
        }
        long docno;
        try {
            docno = Long.parseLong(((FlatJSONDocument) currentDocument).getProperty("docno"));
        } catch (Exception e) {
            logger.warn("WARN: Parsing failure... skipping document");
            return null;
        }
        if (alldocnos.contains(docno)) {
            return null;
        }
        alldocnos.add(docno);
        return currentDocument;
    }

    /**
     * @return true once {@link #openNextFile()} has run out of files to open.
     */
    @Override
    public boolean endOfCollection() {
        return endOfCollection;
    }

    /**
     * Not implemented: only logs an error, the collection state is left untouched.
     */
    @Override
    public void reset() {
        logger.error("WARN: TwitterJSONCollection.reset() was called but it has not been implemented.");
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy