All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.ucla.sspace.index.RandomIndexVectorGenerator Maven / Gradle / Ivy

Go to download

The S-Space Package is a collection of algorithms for building Semantic Spaces as well as a highly-scalable library for designing new distributional semantics algorithms. Distributional algorithms process text corpora and represent the semantics of words as high-dimensional feature vectors. This package also includes matrices, vectors, and numerous clustering algorithms. These approaches are known by many names, such as word spaces, semantic spaces, or distributed semantics, and rest upon the Distributional Hypothesis: words that appear in similar contexts have similar meanings.

The newest version!
/*
 * Copyright 2009 David Jurgens
 *
 * This file is part of the S-Space package and is covered under the terms and
 * conditions therein.
 *
 * The S-Space package is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation and distributed hereunder to you.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
 * EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
 * NO REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
 * WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
 * RIGHTS.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

package edu.ucla.sspace.index;

import edu.ucla.sspace.vector.TernaryVector;
import edu.ucla.sspace.vector.IntegerVector;

import java.io.Serializable;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Map;
import java.util.Iterator;
import java.util.Properties;
import java.util.Random;
import java.util.Set;


/**
 * An class that generates {@link RandomTernaryVector} instances based on
 * configurable properties.  This class supports two properties:
 *
 * 
* *
Property: {@value #VALUES_TO_SET_PROPERTY} *
* Default: {@value #DEFAULT_INDEX_VECTOR_VALUES} * *
This variable sets the number of bits to set in * an index vector.

* *

Property: {@value #INDEX_VECTOR_VARIANCE_PROPERTY} *
* Default: {@value #DEFAULT_INDEX_VECTOR_VARIANCE} * *
This variable sets the variance in the number * of bits to set in an index vector. For example, having {@value * #VALUES_TO_SET_PROPERTY}{@code =4} and setting this property to {@code * 2} would mean that {@code 4 ± 2} value would be randomly set in * each index vector.

* *

*/ public class RandomIndexVectorGenerator implements IntegerVectorGenerator, Serializable { private static final long serialVersionUID = 1L; /** * A random number generator that can be accessed to other classes which * will rely on the same source of random values. */ public static final Random RANDOM = new Random(); /** * The prefix for naming public properties. */ private static final String PROPERTY_PREFIX = "edu.ucla.sspace.index.RandomIndexVectorGenerator"; /** * The property to specify the number of values to set in an {@link * TernaryVector}. */ public static final String VALUES_TO_SET_PROPERTY = PROPERTY_PREFIX + ".values"; /** * The property to specify the variance in the number of values to set in an * {@link TernaryVector}. */ public static final String INDEX_VECTOR_VARIANCE_PROPERTY = PROPERTY_PREFIX + ".variance"; /** * The default number of values to set in an {@link TernaryVector}. */ public static final int DEFAULT_INDEX_VECTOR_VALUES = 4; /** * The default number of dimensions to create in each {@code TernaryVector}. */ public static final int DEFAULT_INDEX_VECTOR_LENGTH = 20000; /** * The default random variance in the number of values that are set in an * {@code TernaryVector}. */ public static final int DEFAULT_INDEX_VECTOR_VARIANCE = 0; /** * The number of values to set in an {@link TernaryVector}. */ private int numVectorValues; /** * The variance in the number of values that are set in an {@code * TernaryVector}. */ private int variance; private int indexVectorLength; /** * Constructs this instance using the system properties. */ public RandomIndexVectorGenerator(int indexVectorLength) { this(indexVectorLength, System.getProperties()); } /** * Constructs this instance using the provided properties. 
*/ public RandomIndexVectorGenerator(int indexVectorLength, Properties properties) { this.indexVectorLength = indexVectorLength; String numVectorValuesProp = properties.getProperty(VALUES_TO_SET_PROPERTY); numVectorValues = (numVectorValuesProp != null) ? Integer.parseInt(numVectorValuesProp) : DEFAULT_INDEX_VECTOR_VALUES; String varianceProp = properties.getProperty(INDEX_VECTOR_VARIANCE_PROPERTY); variance = (varianceProp != null) ? Integer.parseInt(varianceProp) : DEFAULT_INDEX_VECTOR_VARIANCE; } /** * Creates an {@code TernaryVector} with the provided length. * * @param length the length of the index vector * * @return an index vector */ public TernaryVector generate() { HashSet pos = new HashSet(); HashSet neg = new HashSet(); // Randomly decide how many bits to set in the index vector based on the // variance. int bitsToSet = numVectorValues + (int)(RANDOM.nextDouble() * variance * ((RANDOM.nextDouble() > .5) ? 1 : -1)); for (int i = 0; i < bitsToSet; ++i) { boolean picked = false; // loop to ensure we actually pick the full number of bits while (!picked) { // pick some random index int index = RANDOM.nextInt(indexVectorLength); // check that we haven't already added this index if (pos.contains(index) || neg.contains(index)) continue; // decide positive or negative ((RANDOM.nextDouble() > .5) ? pos : neg).add(index); picked = true; } } int[] positive = new int[pos.size()]; int[] negative = new int[neg.size()]; Iterator it = pos.iterator(); for (int i = 0; i < positive.length; ++i) positive[i] = it.next(); it = neg.iterator(); for (int i = 0; i < negative.length; ++i) negative[i] = it.next(); // sort so we can use a binary search in getValue() Arrays.sort(positive); Arrays.sort(negative); return new TernaryVector(indexVectorLength, positive, negative); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy