// com.expleague.ml.models.gpf.weblogmodel.WebLogV1GPFSession (artifact page header: Maven / Gradle / Ivy)
package com.expleague.ml.models.gpf.weblogmodel;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.zip.GZIPInputStream;
import com.expleague.ml.models.gpf.GPFLinearModel;
import com.expleague.ml.models.gpf.GPFModel;
import com.expleague.ml.models.gpf.Session;
import com.google.gson.*;
/**
* User: irlab
* Date: 22.05.14
*/
public class WebLogV1GPFSession {
public static List> loadDatasetFromJSON(final InputStream is, final GPFModel model, final int rows_limit) throws IOException {
final List> dataset = new ArrayList<>();
final LineNumberReader lnr = new LineNumberReader(new InputStreamReader(is, "UTF8"));
final Gson gson = new Gson();
final Gson gson_prettyprint = new GsonBuilder().setPrettyPrinting().create();
for (String line = lnr.readLine(); line != null; line = lnr.readLine()) {
if (rows_limit > 0 && dataset.size() >= rows_limit)
break;
final String[] split = line.split("\t");
final String json_ses_str = split[2];
final JsonSes ses = gson.fromJson(json_ses_str, JsonSes.class);
final BlockV1[] blocks = new BlockV1[ses.sntypes.length];
for (int i = 0; i < blocks.length; i++) {
blocks[i] = new BlockV1(
Session.BlockType.RESULT,
BlockV1.ResultType.valueOf(ses.sntypes[i]),
i,
BlockV1.ResultGrade.valueOf(ses.rel[i]));
}
final String source_string = gson_prettyprint.toJson(ses);
final Session session = new Session(ses.uid, ses.reqid, ses.user_region, ses.query, source_string);
setSessionData(session, blocks, ses.clicks);
dataset.add(session);
}
return dataset;
}
/**
* this function sets up the structure of a Session: a set of vertices (blocks and virtual blocks), and a set of edges
* @param ses - a Session to set up (write-only)
* @param result_blocks - a set of 'real' (observed) blocks (read-only)
* @param clicks_block_indexes - list of clicks (clicks_block_indexes[i] is a i'th click on result_blocks[clicks_block_indexes[i]])
*/
public static void setSessionData(final Session ses, final BlockV1[] result_blocks, final int[] clicks_block_indexes) {
// init blocks
final BlockV1[] blocks = new BlockV1[result_blocks.length + Session.R0_INDEX];
// int[] result_pos2block_ind = new int[100];
int max_result_pos = -1;
int min_result_pos = 1000;
blocks[Session.Q_INDEX] = new BlockV1(Session.BlockType.Q, -1);
blocks[Session.S_INDEX] = new BlockV1(Session.BlockType.S, -1);
blocks[Session.E_INDEX] = new BlockV1(Session.BlockType.E, -1);
for (int i = 0; i < result_blocks.length; i++) {
blocks[i + Session.R0_INDEX] = result_blocks[i];
max_result_pos = Math.max(max_result_pos, result_blocks[i].position);
min_result_pos = Math.min(min_result_pos, result_blocks[i].position);
}
ses.setBlocks(blocks);
final int[] click_indexes = new int[clicks_block_indexes.length];
for (int i = 0; i < click_indexes.length; i++)
click_indexes[i] = clicks_block_indexes[i] + Session.R0_INDEX;
ses.setClick_indexes(click_indexes);
// init edges
final List edges = new ArrayList<>();
for (int i = Session.R0_INDEX; i < blocks.length; i++) {
// R_i -> R_{i+1}
if (i + 1 < blocks.length)
edges.add(new Session.Edge(i, i+1));
// R_i -> R_{i-1}
if (i > Session.R0_INDEX)
edges.add(new Session.Edge(i, i-1));
// Q -> R_i
edges.add(new Session.Edge(Session.Q_INDEX, i));
// S -> R_i
edges.add(new Session.Edge(Session.S_INDEX, i));
// R_i -> S
edges.add(new Session.Edge(i, Session.S_INDEX));
// R_i -> E
edges.add(new Session.Edge(i, Session.E_INDEX));
// E -> E
edges.add(new Session.Edge(Session.E_INDEX, Session.E_INDEX));
}
ses.setEdges(edges);
}
// {
// "__module__": "gpf_learn",
// "uid": "y1495086881377522622",
// "timestamp": 1377789431,
// "rel": ["RELEVANT_PLUS", "RELEVANT_PLUS", "RELEVANT_PLUS", "NOT_ASED", "NOT_ASED", "RELEVANT_PLUS", "RELEVANT_PLUS", "RELEVANT_PLUS", "RELEVANT_MINUS", "NOT_ASED", "RELEVANT_PLUS"],
// "reqid": "1377789430798872-1404107670513402777522981-ws38-714",
// "user_region": 213,
// "query": "строение человека",
// "__name__": "Session",
// "sntypes": ["WEB", "WEB", "WEB", "IMAGES", "WEB", "WEB", "WEB", "WEB", "WEB", "WEB", "WEB"],
// "clicks": [0, 3],
// "clicks_dwelltime": [3, 2]
// }
public static class JsonSes {
String __module__;
String uid;
long timestamp;
String[] rel;
String reqid;
int user_region;
String query;
String __name__;
String[] sntypes;
int[] clicks;
long[] clicks_dwelltime;
public JsonSes() {
}
}
public static void main(final String[] args) throws IOException {
// test
final long t1 = System.currentTimeMillis();
final List> dataset = loadDatasetFromJSON(new GZIPInputStream(WebLogV1GPFSession.class.getResourceAsStream("ses_100k_simple_rand1.dat.gz")), new GPFLinearModel(), 0);
System.out.println("dataset size: " + dataset.size());
System.out.println("time: " + (System.currentTimeMillis() - t1));
System.out.println("dataset[0]: " + dataset.get(0));
}
}
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy