es.uam.eps.ir.relison.diffusion.io.DataReader Maven / Gradle / Ivy
The newest version!
/*
* Copyright (C) 2020 Information Retrieval Group at Universidad Autónoma
* de Madrid, http://ir.ii.uam.es
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/
package es.uam.eps.ir.relison.diffusion.io;
import es.uam.eps.ir.ranksys.core.util.FastStringSplitter;
import es.uam.eps.ir.relison.diffusion.data.Data;
import es.uam.eps.ir.relison.diffusion.data.Information;
import es.uam.eps.ir.relison.diffusion.simulation.SimulationEdgeTypes;
import es.uam.eps.ir.relison.graph.Graph;
import es.uam.eps.ir.relison.index.Index;
import es.uam.eps.ir.relison.index.Relation;
import es.uam.eps.ir.relison.index.fast.FastIndex;
import es.uam.eps.ir.relison.index.fast.FastWeightedPairwiseRelation;
import es.uam.eps.ir.relison.io.graph.GraphReader;
import es.uam.eps.ir.relison.io.graph.TextGraphReader;
import es.uam.eps.ir.relison.io.graph.TextMultiGraphReader;
import es.uam.eps.ir.relison.utils.datatypes.Triplet;
import es.uam.eps.ir.relison.utils.datatypes.Tuple2oo;
import org.ranksys.formats.parsing.Parser;
import org.ranksys.formats.rec.RecommendationFormat;
import org.ranksys.formats.rec.SimpleRecommendationFormat;
import java.io.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.IntStream;
import static org.ranksys.formats.parsing.Parsers.dp;
import static org.ranksys.formats.parsing.Parsers.lp;
/**
* Class for reading the data.
*
* @author Javier Sanz-Cruzado ([email protected])
* @author Pablo Castells ([email protected])
*
* @param <U> type of the users.
* @param <I> type of the information pieces.
* @param <P> type of the user and information pieces features.
*/
public class DataReader
{
/**
 * Reads the data from files when only the basic information is available:
 * the graph, the two indexes and the information pieces file.
 * <p>
 * Delegates to the complete reader, passing {@code null} for the feature files, the real
 * propagation file, the recommendation file and the feature parser, and
 * {@code Integer.MAX_VALUE} as the number of recommended links.
 *
 * @param graphFile route of a file containing the network graph.
 * @param multigraph true if the graph is a multigraph, false otherwise.
 * @param directed true if the graph is directed, false otherwise.
 * @param weighted true if the graph is weighted, false otherwise.
 * @param selfloops true if the graph allows self loops, false otherwise.
 * @param readtypes true if we must read the types of the edges, false otherwise.
 * @param uIndex route to a file containing a list of user identifiers.
 * @param iIndex route to a file containing a list of information piece identifiers.
 * @param infoFile file containing information about the information pieces (creator and timestamps).
 * @param uParser parser for the user identifiers.
 * @param iParser parser for the information piece identifiers.
 * @return the data object.
 * @throws IOException if something fails while reading.
 */
public Data readData(String graphFile, boolean multigraph, boolean directed, boolean weighted, boolean selfloops, boolean readtypes, String uIndex, String iIndex, String infoFile, Parser uParser, Parser iParser) throws IOException
{
    return readData(
            graphFile, multigraph, directed, weighted, selfloops, readtypes,
            uIndex, iIndex, infoFile,
            null, null,                 // user / information piece feature files
            null,                       // real propagation file
            null, Integer.MAX_VALUE,    // recommendation file and number of links per user
            uParser, iParser,
            null);                      // feature parser
}
/**
 * Reads the data from files when features of users and/or information pieces are available.
 * <p>
 * Delegates to the complete reader, passing {@code null} for the real propagation file and
 * the recommendation file, and {@code Integer.MAX_VALUE} as the number of recommended links.
 *
 * @param graphFile route of a file containing the network graph.
 * @param multigraph true if the graph is a multigraph, false otherwise.
 * @param directed true if the graph is directed, false otherwise.
 * @param weighted true if the graph is weighted, false otherwise.
 * @param selfloops true if the graph allows self loops, false otherwise.
 * @param readtypes true if we have to read the types of the edges, false otherwise.
 * @param uIndex route to a file containing a list of user identifiers.
 * @param iIndex route to a file containing a list of information piece identifiers.
 * @param infoFile file containing information about the information pieces (creator and timestamps).
 * @param userFeatureFiles an array containing the routes of files with user features (might be null).
 * @param infoFeatureFiles an array containing the routes of files with information pieces features (might be null).
 * @param uParser parser for the user identifiers.
 * @param iParser parser for the information piece identifiers.
 * @param pParser parser for the features.
 * @return the data object.
 * @throws IOException if something fails while reading.
 */
public Data readData(String graphFile, boolean multigraph, boolean directed, boolean weighted, boolean selfloops, boolean readtypes, String uIndex, String iIndex, String infoFile, String[] userFeatureFiles, String[] infoFeatureFiles, Parser uParser, Parser iParser, Parser pParser) throws IOException
{
    return readData(
            graphFile, multigraph, directed, weighted, selfloops, readtypes,
            uIndex, iIndex, infoFile,
            userFeatureFiles, infoFeatureFiles,
            null,                       // real propagation file
            null, Integer.MAX_VALUE,    // recommendation file and number of links per user
            uParser, iParser, pParser);
}
/**
 * Reads the data from files when we know which users propagated which information (and when).
 * <p>
 * Delegates to the complete reader, passing {@code null} for the feature files, the
 * recommendation file and the feature parser, and {@code Integer.MAX_VALUE} as the number
 * of recommended links.
 *
 * @param graphFile route of a file containing the network graph.
 * @param multigraph true if the graph is a multigraph, false otherwise.
 * @param directed true if the graph is directed, false otherwise.
 * @param weighted true if the graph is weighted, false otherwise.
 * @param selfloops true if the graph allows self loops, false otherwise.
 * @param readtypes true if the graph has types that we should read.
 * @param uIndex route to a file containing a list of user identifiers.
 * @param iIndex route to a file containing a list of information piece identifiers.
 * @param infoFile file containing information about the information pieces (creator and timestamps).
 * @param realPropagatedFile route to a file containing information about which information pieces were repropagated by each user in the real diffusion (might be null).
 * @param uParser parser for the user identifiers.
 * @param iParser parser for the information piece identifiers.
 * @return a data object to use in a diffusion simulation procedure.
 * @throws IOException if something fails while reading.
 */
public Data readData(String graphFile, boolean multigraph, boolean directed, boolean weighted, boolean selfloops, boolean readtypes, String uIndex, String iIndex, String infoFile, String realPropagatedFile, Parser uParser, Parser iParser) throws IOException
{
    return readData(
            graphFile, multigraph, directed, weighted, selfloops, readtypes,
            uIndex, iIndex, infoFile,
            null, null,                 // user / information piece feature files
            realPropagatedFile,
            null, Integer.MAX_VALUE,    // recommendation file and number of links per user
            uParser, iParser,
            null);                      // feature parser
}
/**
 * Reads the data from files when we know which users propagated which information (and when)
 * and we also have features of users and/or information pieces.
 * <p>
 * Delegates to the complete reader, passing {@code null} for the recommendation file and
 * {@code Integer.MAX_VALUE} as the number of recommended links.
 *
 * @param graphFile route of a file containing the network graph.
 * @param multigraph true if the graph is a multigraph, false otherwise.
 * @param directed true if the graph is directed, false otherwise.
 * @param weighted true if the graph is weighted, false otherwise.
 * @param selfloops true if the graph allows self loops, false otherwise.
 * @param readtypes true if the graph has types that we should read.
 * @param uIndex route to a file containing a list of user identifiers.
 * @param iIndex route to a file containing a list of information piece identifiers.
 * @param infoFile file containing information about the information pieces (creator and timestamps).
 * @param userFeatureFiles an array containing the routes of files with user features (might be null).
 * @param infoFeatureFiles an array containing the routes of files with information pieces features (might be null).
 * @param realPropagatedFile route to a file containing information about which information pieces were repropagated by each user in the real diffusion (might be null).
 * @param uParser parser for the user identifiers.
 * @param iParser parser for the information piece identifiers.
 * @param pParser parser for the features.
 * @return a data object to use in a diffusion simulation procedure.
 * @throws IOException if something fails while reading.
 */
public Data readData(String graphFile, boolean multigraph, boolean directed, boolean weighted, boolean selfloops, boolean readtypes, String uIndex, String iIndex, String infoFile, String[] userFeatureFiles, String[] infoFeatureFiles, String realPropagatedFile, Parser uParser, Parser iParser, Parser pParser) throws IOException
{
    return readData(
            graphFile, multigraph, directed, weighted, selfloops, readtypes,
            uIndex, iIndex, infoFile,
            userFeatureFiles, infoFeatureFiles,
            realPropagatedFile,
            null, Integer.MAX_VALUE,    // recommendation file and number of links per user
            uParser, iParser, pParser);
}
/**
 * Reads the data from files when only the basic information is available,
 * in addition to a contact recommendation.
 * <p>
 * Delegates to the complete reader with {@code readtypes} set to {@code false}, passing
 * {@code null} for the feature files, the real propagation file and the feature parser.
 *
 * @param graphFile route of a file containing the network graph.
 * @param multigraph true if the graph is a multigraph, false otherwise.
 * @param directed true if the graph is directed, false otherwise.
 * @param weighted true if the graph is weighted, false otherwise.
 * @param selfloops true if the graph allows self loops, false otherwise.
 * @param uIndex route to a file containing a list of user identifiers.
 * @param iIndex route to a file containing a list of information piece identifiers.
 * @param infoFile file containing information about the information pieces (creator and timestamps).
 * @param recFile file containing the results of applying a contact recommendation algorithm over the network (might be null).
 * @param topN number of links (per user) to add from the recommendations.
 * @param uParser parser for the user identifiers.
 * @param iParser parser for the information piece identifiers.
 * @return the data object.
 * @throws IOException if something fails while reading.
 */
public Data readData(String graphFile, boolean multigraph, boolean directed, boolean weighted, boolean selfloops, String uIndex, String iIndex, String infoFile, String recFile, int topN, Parser uParser, Parser iParser) throws IOException
{
    return readData(
            graphFile, multigraph, directed, weighted, selfloops,
            false,                      // readtypes
            uIndex, iIndex, infoFile,
            null, null,                 // user / information piece feature files
            null,                       // real propagation file
            recFile, topN,
            uParser, iParser,
            null);                      // feature parser
}
/**
 * Reads the data from files when features of users and/or information pieces are available,
 * in addition to a contact recommendation.
 * <p>
 * Delegates to the complete reader with {@code readtypes} set to {@code false}, passing
 * {@code null} for the real propagation file.
 *
 * @param graphFile route of a file containing the network graph.
 * @param multigraph true if the graph is a multigraph, false otherwise.
 * @param directed true if the graph is directed, false otherwise.
 * @param weighted true if the graph is weighted, false otherwise.
 * @param selfloops true if the graph allows self loops, false otherwise.
 * @param uIndex route to a file containing a list of user identifiers.
 * @param iIndex route to a file containing a list of information piece identifiers.
 * @param infoFile file containing information about the information pieces (creator and timestamps).
 * @param userFeatureFiles an array containing the routes of files with user features (might be null).
 * @param infoFeatureFiles an array containing the routes of files with information pieces features (might be null).
 * @param recFile file containing the results of applying a contact recommendation algorithm over the network (might be null).
 * @param topN number of links (per user) to add from the recommendations.
 * @param uParser parser for the user identifiers.
 * @param iParser parser for the information piece identifiers.
 * @param pParser parser for the features.
 * @return the data object.
 * @throws IOException if something fails while reading.
 */
public Data readData(String graphFile, boolean multigraph, boolean directed, boolean weighted, boolean selfloops, String uIndex, String iIndex, String infoFile, String[] userFeatureFiles, String[] infoFeatureFiles, String recFile, int topN, Parser uParser, Parser iParser, Parser pParser) throws IOException
{
    return readData(
            graphFile, multigraph, directed, weighted, selfloops,
            false,                      // readtypes
            uIndex, iIndex, infoFile,
            userFeatureFiles, infoFeatureFiles,
            null,                       // real propagation file
            recFile, topN,
            uParser, iParser, pParser);
}
/**
 * Reads the data from files when we know which users propagated which information (and when),
 * in addition to a contact recommendation.
 * <p>
 * Delegates to the complete reader with {@code readtypes} set to {@code false}, passing
 * {@code null} for the feature files and the feature parser.
 *
 * @param graphFile route of a file containing the network graph.
 * @param multigraph true if the graph is a multigraph, false otherwise.
 * @param directed true if the graph is directed, false otherwise.
 * @param weighted true if the graph is weighted, false otherwise.
 * @param selfloops true if the graph allows self loops, false otherwise.
 * @param uIndex route to a file containing a list of user identifiers.
 * @param iIndex route to a file containing a list of information piece identifiers.
 * @param infoFile file containing information about the information pieces (creator and timestamps).
 * @param realPropagatedFile route to a file containing information about which information pieces were repropagated by each user in the real diffusion (might be null).
 * @param recFile file containing the results of applying a contact recommendation algorithm over the network (might be null).
 * @param topN number of links (per user) to add from the recommendations.
 * @param uParser parser for the user identifiers.
 * @param iParser parser for the information piece identifiers.
 * @return a data object to use in a diffusion simulation procedure.
 * @throws IOException if something fails while reading.
 */
public Data readData(String graphFile, boolean multigraph, boolean directed, boolean weighted, boolean selfloops, String uIndex, String iIndex, String infoFile, String realPropagatedFile, String recFile, int topN, Parser uParser, Parser iParser) throws IOException
{
    return readData(
            graphFile, multigraph, directed, weighted, selfloops,
            false,                      // readtypes
            uIndex, iIndex, infoFile,
            null, null,                 // user / information piece feature files
            realPropagatedFile,
            recFile, topN,
            uParser, iParser,
            null);                      // feature parser
}
/**
 * Reads the data from files when we know which users propagated which information (and when),
 * we have features of users and/or information pieces, and we also have a contact recommendation.
 * <p>
 * Delegates to the complete reader with {@code readtypes} set to {@code false}, forwarding
 * the recommendation file and the number of links per user exactly as received.
 *
 * @param graphFile route of a file containing the network graph.
 * @param multigraph true if the graph is a multigraph, false otherwise.
 * @param directed true if the graph is directed, false otherwise.
 * @param weighted true if the graph is weighted, false otherwise.
 * @param selfloops true if the graph allows self loops, false otherwise.
 * @param uIndex route to a file containing a list of user identifiers.
 * @param iIndex route to a file containing a list of information piece identifiers.
 * @param infoFile file containing information about the information pieces (creator and timestamps).
 * @param userFeatureFiles an array containing the routes of files with user features (might be null).
 * @param infoFeatureFiles an array containing the routes of files with information pieces features (might be null).
 * @param realPropagatedFile route to a file containing information about which information pieces were repropagated by each user in the real diffusion (might be null).
 * @param recFile file containing the results of applying a contact recommendation algorithm over the network (might be null).
 * @param topN number of links (per user) to add from the recommendations.
 * @param uParser parser for the user identifiers.
 * @param iParser parser for the information piece identifiers.
 * @param pParser parser for the features.
 * @return a data object to use in a diffusion simulation procedure.
 * @throws IOException if something fails while reading.
 */
public Data readData(String graphFile, boolean multigraph, boolean directed, boolean weighted, boolean selfloops, String uIndex, String iIndex, String infoFile, String[] userFeatureFiles, String[] infoFeatureFiles, String realPropagatedFile, String recFile, int topN, Parser uParser, Parser iParser, Parser pParser) throws IOException
{
    // recFile and topN must be forwarded: passing (null, Integer.MAX_VALUE) here would
    // discard the recommendation the caller supplied.
    return this.readData(
            graphFile, multigraph, directed, weighted, selfloops,
            false,                      // readtypes
            uIndex, iIndex, infoFile,
            userFeatureFiles, infoFeatureFiles,
            realPropagatedFile,
            recFile, topN,
            uParser, iParser, pParser);
}
/**
* Read the data from files.
* @param graphFile route of a file containing the network graph.
* @param multigraph true if the graph is a multigraph, false otherwise.
* @param directed true if the graph is directed, false otherwise.
* @param weighted true if the graph is weighted, false otherwise.
* @param selfloops true if the graph allows self loops, false otherwise.
* @param readtypes true if the graph has types that we should read. NOTE: if we use recommendations, this parameter will be considered as false.
* @param uIndex route to a file containing a list of user identifiers.
* @param iIndex route to a file containing a list of information piece identifiers.
* @param infoFile file containing information about the information pieces (creator and timestamps).
* @param userFeatureFiles an array containing the routes of files user features (might be null).
* @param infoFeatureFiles an array containing the routes of files with information pieces features (might be null).
* @param realPropagatedFile route to a file containing information about which information pieces were repropagated by each user in the real diffusion (might be null).
* @param recFile file containing the results of applying a contact recommendation algorithm over the network (might be null).
* @param topN number of links (per user) to add from the recommendations.
* @param uParser parser for the user identifiers.
* @param iParser parser for the information piece identifiers.
* @param pParser parser for the feature values.
* @return a data object to use in a diffusion simulation procedure.
* @throws IOException if something fails while reading.
*/
protected Data readData(String graphFile, boolean multigraph, boolean directed, boolean weighted, boolean selfloops, boolean readtypes, String uIndex, String iIndex, String infoFile, String[] userFeatureFiles, String[] infoFeatureFiles, String realPropagatedFile, String recFile, int topN, Parser uParser, Parser iParser, Parser pParser) throws IOException
{
// We first read the user and information pieces index.
Index userIndex = this.readIndex(uIndex, uParser);
Index infoIndex = this.readIndex(iIndex, iParser);
// Then, we read the graph:
GraphReader greader = multigraph ? new TextMultiGraphReader<>(directed, weighted, selfloops, "\t", uParser) : new TextGraphReader<>(directed, weighted, selfloops, "\t", uParser);
Graph graph = greader.read(graphFile, weighted, recFile == null && readtypes);
// We add to the graph all the recommendation edges.
if(recFile != null)
{
RecommendationFormat format = new SimpleRecommendationFormat<>(uParser, uParser);
format.getReader(recFile).readAll().forEach(rec ->
{
U u = rec.getUser();
rec.getItems().stream().limit(topN).forEach(r -> graph.addEdge(u, r.v1, 1.0, SimulationEdgeTypes.RECOMMEND));
});
}
// We read the data about the information (i.e. the creators and timestamps of the information pieces)
Tuple2oo