// Source listing: edu.ucla.sspace.tri.FixedDurationTemporalRandomIndexing (from the sspace-wordsi artifact)
/*
* Copyright 2009 David Jurgens
*
* This file is part of the S-Space package and is covered under the terms and
* conditions therein.
*
* The S-Space package is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation and distributed hereunder to you.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
* EXPRESS OR IMPLIED ARE MADE. BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
* NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
* PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
* WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
* RIGHTS.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package edu.ucla.sspace.tri;
import java.util.Properties;
import edu.ucla.sspace.util.TimeSpan;
/**
* A {@link edu.ucla.sspace.temporal.TemporalSemanticSpace
* TemporalSemanticSpace} class that optimizes a special case of {@link
* TemporalRandomIndexing} where the documents are in sorted order and the
* duration of a semantic partition is fixed. Specifically, the following three
* properties are required:
*
* <ul>
* <li> A fixed length semantic partition will be used throughout the entire
* data processing, as specified by a {@link TimeSpan}.
*
* <li> All documents are processed in order from least recent to most recent.
*
* <li> If a new document's timestamp exceeds the current semantic slice's
* duration (i.e. it would be a part of a different semantic partition),
* the current semantic partition's data may be discarded and a new partition
* created.
* </ul>
*
*
*
* This implementation is based on:
*
* - D. Jurgens and K. Stevens,
* "Event Detection in Blogs using Temporal Random Indexing," in
* Proceedings of the Events in Emerging Text Types (eETTs) Workshop at
* RANLP 2009. Borovets, Bulgaria, 2009.
*
*
*
* In addition to the properties specified in {@link
* OrderedTemporalRandomIndexing}, this class defines the following configurable
* properties:
*
*
*
* <dl>
* <dt> <i>Property:</i> <code>{@value #SEMANTIC_PARTITION_DURATION_PROPERTY}</code>
* <br>
* <i>Default:</i> 1 month
*
* <dd> This property specifies a {@link TimeSpan}
* configuration string that will be used to determine the duration of all
* semantic partitions generated by this instance.
* </dl>
*
*
*
* This class does not support arbitrary multithreading of the {@link
* #processDocument(BufferedReader) processDocument} method. However, it
* does support concurrent calls provided that all the documents are
* within the same semantic partition. That is, multiple threads may be used to
* process all of a semantic partition's documents, provided that the documents
* from the next partition are not interleaved. If the document ordering and
* time span are known ahead of time, multi-threading can be done with a {@link
* java.util.concurrent.CyclicBarrier CyclicBarrier}. The following is an
* example of how to correctly multi-thread this class.
*
*
* // Initialize the following variables according to program semantics
* int numThreads;
* TimeSpan partitionDuration;
* Iterator<TemporalDocument> documents;
* FixedDurationTemporalRandomIndexing fdTRI;
*
* // As threads finish processing a semantic partition, they add the value of the
* // next time stamp as a key in this map, which allows the processing thread
* // (see partitionHook below) to determine the start time of the next partition
* ConcurrentNavigableMap<Long,Object> futureStartTimes =
* new ConcurrentSkipListMap<Long,Object>();
*
* // Create a custom Runnable that will handle processing the semantic space
* // after the partition has been finished.
* Runnable partitionHook = new Runnable() {
* public void run() {
* // Process the semantic space as necessary here...
*
* // Once processing has finished, notify the threads of the next
* // time stamp that will be processed. In the unlikely event that
* // the number of documents in a partition is less than the number of
* // threads, this ensures that a thread processing the partition after
* // the next one correctly waits.
* Long ssStart = futureStartTimes.firstKey();
* futureStartTimes.clear(); // reset for next partition
*
* // lastly, update the start time of the current partition
* startTimeOfCurrentPartition.set(ssStart);
* }
* };
*
* // Create the barrier that the threads will use to synchronize their
* // processDocument() calls. Note that we use the partition hook here
* // instead of attaching it via the addPartitionHook() method
* final CyclicBarrier exceededTimeSpanBarrier =
* new CyclicBarrier(numThreads, partitionHook);
*
* // A required barrier for the initial case of setting the start time for the
* // first partition
* final AtomicBoolean startBarrier = new AtomicBoolean(false);
*
* // The starting time for the current semantic partition. This value is used to
* // determine if processing the next document would cause the current partition
* // to be partitioned and a new partition created.
* final AtomicLong startTimeOfCurrentPartition = new AtomicLong();
*
* // Before a Thread blocks waiting for partition processing, it records the
* // time for its next document (exceeding the duration) in the
* // futureStartTimes map declared above. These times are used to select the
* // start time for the next partition.
*
* // A counter for which document is being processed
* final AtomicInteger docCounter = new AtomicInteger(0);
*
* // Start all the threads
* for (int i = 0; i < numThreads; ++i) {
*
* Thread processingThread = new Thread() {
* public void run() {
*
* // repeatedly try to process any remaining documents
* while (documents.hasNext()) {
*
* TemporalDocument doc = documents.next();
* long docTime = doc.timeStamp();
* int docNumber = docCounter.incrementAndGet();
*
* // special case for first document
* if (docNumber == 1) {
* startTimeOfCurrentPartition.set(docTime);
* startBarrier.set(true);
* }
*
* // Spin until the Thread with the first document sets the
* // initial starting document time. Note that we spin here
* // instead of block, because this is expected that another
* // thread will immediately set this and so it will be a
* // quick no-op
* while (startBarrier.get() == false)
* ;
*
* // Check whether the time for this document would exceed the
* // maximum duration of the current partition. Loop to ensure
* // that if this thread does loop and another thread has an
* // earlier time that exceeds the time period, then this
* // thread will block until the earlier partition has finished
* // processing
* while (!partitionDuration.insideRange(startTimeOfCurrentPartition.get(), docTime)) {
* try {
* // notify the barrier that this Thread is now
* // processing a document in the next time span. In
* // addition, enqueue the time for this document so
* // the serialization thread can reset the correct
* // s-space start time
* futureStartTimes.put(docTime, new Object());
* exceededTimeSpanBarrier.await();
* } catch (Exception ex) {
* // Handle exception here;
* }
* }
*
* try {
* fdTRI.processDocument(doc.reader());
* } catch (IOException ioe) {
* throw new IOError(ioe); // rethrow
* }
* }
* }
* };
*
* // Start threads and wait for processing to finish...
*
*
* Note that the requirements of an {@code OrderedTemporalRandomIndexing} class
* stipulate that the documents be processed in order. For this class, the
* documents must be in order according to their semantic partition. In addition,
* the first document seen for a semantic partition should be the earliest for that
* partition. This behavior is most easily accomplished by sorting the documents
* according to time stamp prior to processing the documents.
*
* @author David Jurgens
*/
public class FixedDurationTemporalRandomIndexing
        extends OrderedTemporalRandomIndexing {

    /**
     * The partition duration that is used whenever the {@value
     * #SEMANTIC_PARTITION_DURATION_PROPERTY} property has not been provided;
     * documented by the original authors as one month.
     */
    public static final TimeSpan DEFAULT_SEMANTIC_PARTITION_DURATION =
        new TimeSpan(0, 1, 0, 0, 0);

    /**
     * The common prefix shared by the property names of this class; it is
     * also used as the leading component of the space name.
     */
    private static final String PROPERTY_PREFIX =
        "edu.ucla.sspace.tri.FixedDurationTemporalRandomIndexing";

    /**
     * The name of the property whose value is a {@link TimeSpan}
     * configuration string giving the duration of each semantic partition.
     */
    public static final String SEMANTIC_PARTITION_DURATION_PROPERTY =
        PROPERTY_PREFIX + ".partitionDuration";

    /**
     * How long a single semantic partition lasts; fixed at construction.
     */
    private final TimeSpan partitionDuration;

    /**
     * Constructs a {@code FixedDurationTemporalRandomIndexing} that is
     * configured from the JVM's system properties.  When the {@value
     * #SEMANTIC_PARTITION_DURATION_PROPERTY} property is absent, {@link
     * #DEFAULT_SEMANTIC_PARTITION_DURATION} is used.
     */
    public FixedDurationTemporalRandomIndexing() {
        this(System.getProperties());
    }

    /**
     * Constructs a {@code FixedDurationTemporalRandomIndexing} that is
     * configured from the supplied properties.  The properties are first
     * handed to the superclass constructor; afterwards the partition duration
     * is read from {@value #SEMANTIC_PARTITION_DURATION_PROPERTY}, falling
     * back to {@link #DEFAULT_SEMANTIC_PARTITION_DURATION} when that property
     * is not present.
     *
     * @param props the properties used to configure this instance
     */
    public FixedDurationTemporalRandomIndexing(Properties props) {
        super(props);
        String configuredDuration =
            props.getProperty(SEMANTIC_PARTITION_DURATION_PROPERTY);
        if (configuredDuration != null) {
            // An explicit configuration string takes precedence
            partitionDuration = new TimeSpan(configuredDuration);
        } else {
            partitionDuration = DEFAULT_SEMANTIC_PARTITION_DURATION;
        }
    }

    /**
     * {@inheritDoc}
     *
     * The name is composed of the class's property prefix, the partition
     * duration and the vector length, each separated by a dash.
     */
    public String getSpaceName() {
        StringBuilder name = new StringBuilder(PROPERTY_PREFIX);
        name.append('-').append(partitionDuration);
        name.append('-').append(getVectorLength());
        return name.toString();
    }

    /**
     * Decides whether a document with the given time stamp falls outside the
     * current semantic partition, i.e. whether the time stamp is no longer
     * inside the fixed-duration range that begins at {@code startTime}.
     * NOTE(review): {@code startTime} is not declared in this class; it is
     * presumably inherited from {@link OrderedTemporalRandomIndexing}.
     *
     * @param timeStamp {@inheritDoc}
     *
     * @return {@code true} if the time stamp lies outside the duration of
     *         the current semantic partition, {@code false} otherwise
     */
    protected boolean shouldPartitionSpace(long timeStamp) {
        boolean withinCurrentPartition =
            partitionDuration.insideRange(startTime, timeStamp);
        return !withinCurrentPartition;
    }
}