All downloads are free. The search and download functionality uses the official Maven repository.

io.github.repir.Retriever.Document Maven / Gradle / Ivy

The newest version!
package io.github.repir.Retriever;

import io.github.repir.tools.io.buffer.BufferDelayedWriter;
import io.github.repir.tools.io.struct.StructureReader;
import io.github.repir.tools.io.struct.StructureWriter;
import io.github.repir.tools.lib.Log;
import io.github.repir.Repository.ReportableFeature;
import io.github.repir.Strategy.RetrievalModel;
import io.github.repir.tools.io.buffer.BufferSerializable;
import io.github.repir.tools.io.EOCException;

/**
 * Data class to contain values of retrieved documents. A Repository is assumed
 * to consist of entities that can be analyzed, we use Document to represent 
 * these entities, but the same generic mechanism can probably be used for other
 * types of entities, e.g. images, videos. That stored remains bound to these
 * Documents, during a retrieval task Documents can be retrieved with the 
 * feature values it is associated with, e.g. title, url, collectionid, size. 
 * 

* Documents are identified internally buy a unique (docid, partition), where document * is a sequence that is unique only within a partition. The Document itself * is more a data class without logic, its feature values are managed by the * features. */ public class Document implements BufferSerializable { public static Log log = new Log(Document.class); public RetrievalModel retrievalmodel; /** * partition/partition where the document is stored */ public int partition; /** * internal document id, unique per partition */ public int docid; /** * Score assigned to the document for ranking */ public double score; /** * For debug purposes, this is often used to mapOutput each storedfeatures * attribution to the score this is not communicated through the MapReduce * framework */ public StringBuilder report; public Object reportdata[]; private String collectionID; public Document() { } public Document(String collectionid) { this.collectionID = collectionid; } public Document(int docid, int partition) { this.docid = docid; this.partition = partition; } public Document(RetrievalModel rm, int docid, int partition) { this(docid, partition); setRetrievalModel(rm); } public void setRetrievalModel(RetrievalModel rm) { this.retrievalmodel = rm; if (reportdata == null) reportdata = new Object[retrievalmodel.getReportedFeaturesMap().size()]; } public String getCollectionID() { if (collectionID == null && retrievalmodel != null) collectionID = retrievalmodel.repository.getCollectionIDFeature().valueReported(this, 0); return collectionID; } public void setCollectionID(String collectionid) { collectionID = collectionid; } public void decode() { for (ReportedFeature f : retrievalmodel.getReportableFeatures()) { f.feature.decode(this, f.reportID); } } public void setReportedFeature(int f, Object data) { reportdata[f] = data; } public Object getReportedFeature(int f) { return reportdata[f]; } private Object getReportedFeature(ReportedFeature f) { return reportdata[f.reportID]; } public 
String getString(ReportedFeature feature) { return (String)getReportedFeature(feature); } public String getString(ReportableFeature feature) { return (String)getReportedFeature(retrievalmodel.getReportID(feature)); } public int getInt(ReportedFeature feature) { return (Integer)getReportedFeature(feature); } public int getInt(ReportableFeature feature) { return (Integer)getReportedFeature(retrievalmodel.getReportID(feature)); } public double getDouble(ReportedFeature feature) { return (Double)getReportedFeature(feature); } public int[] getIntArray(ReportedFeature feature) { return (int[])getReportedFeature(feature); } public int[] getIntArray(ReportableFeature feature) { return (int[])getReportedFeature(retrievalmodel.getReportID(feature)); } @Override public void write(StructureWriter writer) { writer.write(partition); writer.write(docid); writer.write(score); writer.write(reportdata.length); for (ReportedFeature c : retrievalmodel.getReportableFeatures()) { c.feature.encode(this, c.reportID); } for (int i = 0; i < reportdata.length; i++) { writer.write((byte[]) reportdata[ i]); } writer.write(report); } @Override public void read(StructureReader reader) { try { partition = reader.readInt(); docid = reader.readInt(); score = reader.readDouble(); int featurescount = reader.readInt(); reportdata = new Object[featurescount]; for (int i = 0; i < featurescount; i++) { reportdata[i] = reader.readByteArray(); } report = reader.readStringBuilder(); } catch (EOCException ex) { log.exception(ex, "read( %s )", reader); } } /** * For debug purposes: formats using * {@link Lib.PrintTools#sprintf(java.lang.String, java.lang.Object[])} and * adds it to the document report *

* @param s template * @param o parameters */ public void addReport(String s, Object... o) { if (report == null) { report = new StringBuilder(); } report.append(io.github.repir.tools.lib.PrintTools.sprintf(s, o)); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy