boofcv.alg.scene.ann.RecognitionNearestNeighborInvertedFile Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of boofcv-recognition Show documentation
Show all versions of boofcv-recognition Show documentation
BoofCV is an open source Java library for real-time computer vision and robotics applications.
/*
* Copyright (c) 2022, Peter Abeles. All Rights Reserved.
*
* This file is part of BoofCV (http://boofcv.org).
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package boofcv.alg.scene.ann;
import boofcv.alg.scene.bow.BowDistanceTypes;
import boofcv.alg.scene.bow.BowMatch;
import boofcv.alg.scene.bow.BowUtils;
import boofcv.alg.scene.bow.InvertedFile;
import boofcv.alg.scene.nister2006.TupleMapDistanceNorm;
import boofcv.misc.BoofLambdas;
import lombok.Getter;
import lombok.Setter;
import org.ddogleg.nn.NearestNeighbor;
import org.ddogleg.nn.NnData;
import org.ddogleg.struct.*;
import org.jetbrains.annotations.Nullable;
import java.io.PrintStream;
import java.util.List;
import java.util.Set;
/**
 * Implementation of the "classical" Bag-Of-Words (BOW) (a.k.a. Bag-Of-Visual-Words) [1] for object/scene recognition
 * that uses an inverted file for fast image retrieval [2].
 *
 * <p>An image is described using a set of local image features (e.g. SIFT) which results in a set of n-dimensional
 * vectors. Each feature vector is converted into a word, which is then used to build a histogram of words in the
 * image. A similarity score is computed between two images using the histogram. Words are learned using k-means
 * clustering when applied to a large initial training set of image features.</p>
 *
 * <p>This implementation is designed to be simple and flexible, allowing different algorithms in the same family
 * to be swapped out. For example, the nearest-neighbor (NN) search can be done using a brute force approach, kd-tree,
 * or an approximate kd-tree.</p>
 *
 * <p>No single paper inspired this specific implementation; it borrows ideas from several papers. The first
 * paper below is one of the earlier works to discuss the concept for visual BOW.</p>
 *
 * <ol>
 * <li>Sivic, Josef, and Andrew Zisserman. "Video Google: A text retrieval approach to object matching in videos."
 * Computer Vision, IEEE International Conference on. Vol. 3. IEEE Computer Society, 2003.</li>
 * <li>Nister, David, and Henrik Stewenius. "Scalable recognition with a vocabulary tree."
 * 2006 IEEE Computer Society Conference on Computer Vision and Pattern Recognition (CVPR'06). Vol. 2. IEEE, 2006.</li>
 * </ol>
 *
 * @param <Point> Data type for the 'point'. Typically this is a Tuple.
 * @author Peter Abeles
 */
@SuppressWarnings({"NullAway.Init"})
public class RecognitionNearestNeighborInvertedFile<Point> implements VerbosePrint {
	/** A nearest-neighbor search to look up the closest fit to each word */
	protected @Getter NearestNeighbor<Point> nearestNeighbor;

	/** Distance between two TF-IDF descriptors. L1 and L2 norms are provided */
	protected @Getter @Setter TupleMapDistanceNorm distanceFunction = new TupleMapDistanceNorm.L2();

	/** List of images added to the database. The array index is the internal image index, the value is the user ID */
	protected @Getter final BigDogArray_I32 imagesDB = new BigDogArray_I32(100, 10_000, BigDogGrowth.GROW_FIRST);

	/** List of all images the query was found to be similar/matched with */
	@Getter DogArray<BowMatch> matches = new DogArray<>(BowMatch::new, BowMatch::reset);

	/** List of images in the DB that are observed by each word. One element per word. */
	@Getter DogArray<InvertedFile> invertedFiles = new DogArray<>(InvertedFile::new, InvertedFile::reset);

	//--------------------------- Internal Work Space

	// Used to search for matching words
	public NearestNeighbor.Search<Point> search;
	public final NnData<Point> searchResult = new NnData<>();

	// Look up table from image to BowMatch. All values must be set to -1 after use
	// The size of this array will be the same as the number of DB images
	DogArray_I32 imageIdx_to_match = new DogArray_I32();

	// Histogram for the number of times each word appears. All values must be 0 initially
	// One element for each word
	DogArray_I32 wordHistogram = new DogArray_I32();

	// List of words which were observed
	public DogArray_I32 observedWords = new DogArray_I32();

	// temporary storage for an image TF-IDF descriptor. Element i is the weight of observedWords[i]
	DogArray_F32 tmpDescWeights = new DogArray_F32();

	// If not null then print verbose information here
	@Nullable PrintStream verbose;

	/**
	 * Initializes the data structures. Any images which were previously added are discarded, so it is
	 * safe to call this function more than once on the same instance.
	 *
	 * @param nearestNeighbor Search used to find the words.
	 * @param numWords Number of words
	 */
	public void initialize( NearestNeighbor<Point> nearestNeighbor, int numWords ) {
		this.nearestNeighbor = nearestNeighbor;
		imagesDB.reset();
		// The image list was just emptied, so every inverted file must be emptied too. A plain resize()
		// would keep the contents of already declared elements, leaving references to images that no longer exist
		resetInvertedFiles(numWords);
		wordHistogram.reset().resize(numWords, 0);
		this.search = nearestNeighbor.createSearch();
	}

	/**
	 * Discards all memory of words which were added
	 */
	public void clearImages() {
		imagesDB.reset();
		resetInvertedFiles(invertedFiles.size);
	}

	/**
	 * Empties every inverted file and ensures there is exactly one per word.
	 *
	 * @param numWords Number of words, i.e. number of inverted files after this call
	 */
	private void resetInvertedFiles( int numWords ) {
		// Setting the size to zero first forces resize() to reset all elements, not just newly added ones
		invertedFiles.reset();
		invertedFiles.resize(numWords);
	}

	/**
	 * Adds a new image to the database. Nothing is added if the list of features is empty.
	 *
	 * @param imageID The image's unique ID for later reference
	 * @param imageFeatures Feature descriptors from an image
	 */
	public void addImage( int imageID, List<Point> imageFeatures ) {
		if (imageFeatures.isEmpty())
			return;

		// Internally images are referenced by their index in imagesDB and not by the user provided ID
		int imageIdx = imagesDB.size;
		imagesDB.append(imageID);

		computeWordHistogram(imageFeatures);
		computeImageDescriptor(imageFeatures.size());

		// Add this image to the inverted file for each word
		for (int i = 0; i < observedWords.size; i++) {
			int word = observedWords.get(i);
			invertedFiles.get(word).addImage(imageIdx, tmpDescWeights.get(i));
		}
	}

	/**
	 * Computes the number of times each word appears in the list of features. On return, {@link #observedWords}
	 * lists each word which was seen at least once and {@link #wordHistogram} contains their counts. Features for
	 * which the nearest-neighbor search fails are skipped.
	 *
	 * @param imageFeatures Feature descriptors from an image
	 */
	void computeWordHistogram( List<Point> imageFeatures ) {
		// Find and count the number of times each word appears in this set of features
		observedWords.reset();
		for (int featureIdx = 0; featureIdx < imageFeatures.size(); featureIdx++) {
			// NOTE(review): -1 is presumably "no limit on distance" -- confirm against NearestNeighbor.Search
			if (!search.findNearest(imageFeatures.get(featureIdx), -1, searchResult))
				continue;

			int count = wordHistogram.data[searchResult.index];
			wordHistogram.data[searchResult.index] = count + 1;

			// First time this word has been seen in this image. Relies on the histogram being all zeros initially
			if (count == 0) {
				observedWords.add(searchResult.index);
			}
		}
	}

	/**
	 * Given the image histogram, compute the TF-IDF descriptor. The result is stored in {@link #tmpDescWeights},
	 * with one weight for each element in {@link #observedWords}. As a side effect, the word histogram is
	 * set back to all zeros so that it's ready for the next image.
	 *
	 * @param totalUniqueWordsSeenByImage Value each word's count is divided by. Despite the name, callers in this
	 * class pass in the total number of features in the image.
	 */
	void computeImageDescriptor( float totalUniqueWordsSeenByImage ) {
		// Compute the weight for each word in the descriptor based on its frequency
		tmpDescWeights.reset();
		for (int i = 0; i < observedWords.size; i++) {
			int word = observedWords.get(i);

			// Term frequency: n[i] = number of times word[i] appears in this image / total words in this image
			float termFrequency = wordHistogram.get(word)/totalUniqueWordsSeenByImage;
			tmpDescWeights.add(termFrequency);

			// make sure the histogram is full of zeros again
			wordHistogram.set(word, 0);
		}

		// Normalize the image descriptor
		distanceFunction.normalize(tmpDescWeights);
	}

	/**
	 * Looks up the best matches from the database. The list of all matches which were found can be accessed
	 * by calling {@link #getMatches()}.
	 *
	 * @param queryImage Set of feature descriptors from the query image
	 * @param filter Filter which can be used to reject matches that the user doesn't want returned. False = reject.
	 * @param limit Maximum number of matches it will return.
	 * @return true if at least one match was found and remains after filtering. false otherwise.
	 */
	public boolean query( List<Point> queryImage, @Nullable BoofLambdas.FilterInt filter, int limit ) {
		matches.reset();

		// Can't match to anything if it's empty
		if (queryImage.isEmpty()) {
			return false;
		}

		computeWordHistogram(queryImage);
		computeImageDescriptor(queryImage.size());
		findAndScoreMatches();

		if (matches.isEmpty())
			return false;

		if (verbose != null) verbose.println("raw matches.size=" + matches.size);

		// Restore the look up table and convert image indexes into user IDs
		for (int candidateIter = 0; candidateIter < matches.size; candidateIter++) {
			BowMatch c = matches.get(candidateIter);

			// Ensure this array is once again full of -1
			imageIdx_to_match.set(c.identification, -1);

			// convert it from image index into the user provided ID number
			c.identification = imagesDB.get(c.identification);
		}

		BowUtils.filterAndSortMatches(matches, filter, limit);

		return matches.size > 0;
	}

	/**
	 * Finds all the matches using the observed words and the inverted files. Every DB image which shares at least
	 * one word with the query is added to {@link #matches}, with 'identification' set to the image's index
	 * in {@link #imagesDB}.
	 */
	void findAndScoreMatches() {
		// This will always be filled with -1 initially, resize will just set new elements to -1
		imageIdx_to_match.resize(imagesDB.size, -1);

		// Create a list of all candidate images in the DB
		matches.reset();
		for (int wordIdx = 0; wordIdx < observedWords.size; wordIdx++) {
			float queryWordWeight = tmpDescWeights.get(wordIdx);
			int word = observedWords.get(wordIdx);
			InvertedFile invertedFile = invertedFiles.get(word);

			// Go through the inverted file list
			final int N = invertedFile.weights.size;
			for (int invertedIdx = 0; invertedIdx < N; invertedIdx++) {
				int imageIdx = invertedFile.get(invertedIdx);

				// See if this DB image has been seen before
				BowMatch m;
				int matchIdx = imageIdx_to_match.get(imageIdx);
				if (matchIdx == -1) {
					// It has not been seen before, create a new entry for it in the candidate list
					imageIdx_to_match.set(imageIdx, matches.size);
					m = matches.grow();
					m.identification = imageIdx; // this will be converted to ID on output
				} else {
					m = matches.get(matchIdx);
				}

				// Update the score computation. See TupleMapDistanceNorm for why this is done
				m.error += distanceFunction.distanceUpdate(queryWordWeight, invertedFile.weights.get(invertedIdx));
			}
		}
	}

	/**
	 * Used to change distance function to one of the built in types
	 *
	 * @param type Which distance function to use
	 * @throws IllegalArgumentException If the type is not L1 or L2
	 */
	public void setDistanceType( BowDistanceTypes type ) {
		distanceFunction = switch (type) {
			case L1 -> new TupleMapDistanceNorm.L1();
			case L2 -> new TupleMapDistanceNorm.L2();
			default -> throw new IllegalArgumentException("Unknown type " + type);
		};
	}

	/**
	 * Turns verbose output on or off.
	 *
	 * @param out Where verbose information is printed to. null = disabled.
	 * @param configuration Ignored by this class
	 */
	@Override public void setVerbose( @Nullable PrintStream out, @Nullable Set<String> configuration ) {
		this.verbose = out;
	}
}