All downloads are free. The search and download functionality uses the official Maven repository.

org.carrot2.text.preprocessing.DocumentAssigner Maven / Gradle / Ivy

Go to download

Carrot2 search results clustering framework. Minimal functional subset (core algorithms and infrastructure, no document sources).

There is a newer version: 3.16.3
Show newest version

/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2019, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.text.preprocessing;

import java.util.ArrayList;

import org.carrot2.core.attribute.Processing;
import org.carrot2.text.analysis.TokenTypeUtils;
import org.carrot2.text.preprocessing.PreprocessingContext.AllLabels;
import org.carrot2.util.attribute.*;
import org.carrot2.util.attribute.constraint.IntRange;

import com.carrotsearch.hppc.BitSet;
import com.carrotsearch.hppc.IntArrayList;
import org.carrot2.shaded.guava.common.collect.Lists;

/**
 * Assigns document to label candidates. For each label candidate from
 * {@link AllLabels#featureIndex} an {@link BitSet} with the assigned documents is
 * constructed. The assignment algorithm is rather simple: in order to be assigned to a
 * label, a document must contain at least one occurrence of each non-stop word from the
 * label.
 * 

* This class saves the following results to the {@link PreprocessingContext} : *

    *
  • {@link AllLabels#documentIndices}
  • *
*

* This class requires that {@link Tokenizer}, {@link CaseNormalizer}, * {@link StopListMarker}, {@link PhraseExtractor} and {@link LabelFilterProcessor} be * invoked first. */ @Bindable(prefix = "DocumentAssigner") public class DocumentAssigner { /** * Only exact phrase assignments. Assign only documents that contain the label in its * original form, including the order of words. Enabling this option will cause less * documents to be put in clusters, which result in higher precision of assignment, * but also a larger "Other Topics" group. Disabling this option will cause more * documents to be put in clusters, which will make the "Other Topics" cluster * smaller, but also lower the precision of cluster-document assignments. */ @Input @Processing @Attribute @Label("Exact phrase assignment") @Level(AttributeLevel.MEDIUM) @Group(DefaultGroups.PREPROCESSING) public boolean exactPhraseAssignment = false; /** * Determines the minimum number of documents in each cluster. */ @Input @Processing @Attribute @IntRange(min = 1, max = 100) @Label("Minimum cluster size") @Level(AttributeLevel.MEDIUM) @Group(DefaultGroups.PREPROCESSING) public int minClusterSize = 2; /** * Assigns document to label candidates. 
*/ public void assign(PreprocessingContext context) { final int [] labelsFeatureIndex = context.allLabels.featureIndex; final int [][] stemsTfByDocument = context.allStems.tfByDocument; final int [] wordsStemIndex = context.allWords.stemIndex; final short [] wordsTypes = context.allWords.type; final int [][] phrasesTfByDocument = context.allPhrases.tfByDocument; final int [][] phrasesWordIndices = context.allPhrases.wordIndices; final int wordCount = wordsStemIndex.length; final int documentCount = context.documents.size(); final BitSet [] labelsDocumentIndices = new BitSet [labelsFeatureIndex.length]; for (int i = 0; i < labelsFeatureIndex.length; i++) { final BitSet documentIndices = new BitSet(documentCount); final int featureIndex = labelsFeatureIndex[i]; if (featureIndex < wordCount) { addTfByDocumentToBitSet(documentIndices, stemsTfByDocument[wordsStemIndex[featureIndex]]); } else { final int phraseIndex = featureIndex - wordCount; if (exactPhraseAssignment) { addTfByDocumentToBitSet(documentIndices, phrasesTfByDocument[phraseIndex]); } else { final int [] wordIndices = phrasesWordIndices[phraseIndex]; boolean firstAdded = false; for (int j = 0; j < wordIndices.length; j++) { final int wordIndex = wordIndices[j]; if (!TokenTypeUtils.isCommon(wordsTypes[wordIndex])) { if (!firstAdded) { addTfByDocumentToBitSet(documentIndices, stemsTfByDocument[wordsStemIndex[wordIndex]]); firstAdded = true; } else { final BitSet temp = new BitSet(documentCount); addTfByDocumentToBitSet(temp, stemsTfByDocument[wordsStemIndex[wordIndex]]); // .retainAll == set intersection documentIndices.and(temp); } } } } } labelsDocumentIndices[i] = documentIndices; } // Filter out labels that do not meet the minimum cluster size if (minClusterSize > 1) { final IntArrayList newFeatureIndex = new IntArrayList( labelsFeatureIndex.length); final ArrayList newDocumentIndices = Lists .newArrayListWithExpectedSize(labelsFeatureIndex.length); for (int i = 0; i < labelsFeatureIndex.length; i++) { if 
(labelsDocumentIndices[i].cardinality() >= minClusterSize) { newFeatureIndex.add(labelsFeatureIndex[i]); newDocumentIndices.add(labelsDocumentIndices[i]); } } context.allLabels.documentIndices = newDocumentIndices .toArray(new BitSet [newDocumentIndices.size()]); context.allLabels.featureIndex = newFeatureIndex.toArray(); LabelFilterProcessor.updateFirstPhraseIndex(context); } else { context.allLabels.documentIndices = labelsDocumentIndices; } } private static void addTfByDocumentToBitSet(final BitSet documentIndices, final int [] tfByDocument) { for (int j = 0; j < tfByDocument.length / 2; j++) { documentIndices.set(tfByDocument[j * 2]); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy