All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.clementlevallois.utils.TfIdf Maven / Gradle / Ivy

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package net.clementlevallois.utils;

import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeMap;

/**
 *
 * @author LEVALLOIS
 */
public class TfIdf {

    /**
     *
     * @param categoriesToTerms
     * @return
     */
    public static Map> calculateTopTermsPerCategory(Map> categoriesToTerms) {

        Map> result = new TreeMap();
        
        Multiset termsCount = new Multiset();
        for (Entry> entry : categoriesToTerms.entrySet()) {
            termsCount.addAllFromMultiset(entry.getValue());
        }

        Multiset termsPerCategory;
        Map termsPerCategoryAndTheirRelativeFrequencies;
        for (String category : categoriesToTerms.keySet()) {
            termsPerCategory = categoriesToTerms.get(category);
            termsPerCategoryAndTheirRelativeFrequencies = new HashMap();
            for (Entry entry : termsPerCategory.getEntrySet()) {
                int countTermInThisCategory = entry.getValue();
                // count of the word in this doc  / Max (1, count total du term across docs - count du term in this doc)
                float relativeFrequency = (float) (Math.pow(countTermInThisCategory, 1.5) / Math.max(1, termsCount.getCount(entry.getKey()) - countTermInThisCategory));
                termsPerCategoryAndTheirRelativeFrequencies.put(entry.getKey(), relativeFrequency);
            }
            result.put(category, termsPerCategoryAndTheirRelativeFrequencies);
        }

        return result;

    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy