org.wikibrain.spatial.cookbook.CalculateAllDistancePairs Maven / Gradle / Ivy
The newest version!
package org.wikibrain.spatial.cookbook;
import au.com.bytecode.opencsv.CSVWriter;
import com.vividsolutions.jts.geom.Geometry;
import com.vividsolutions.jts.geom.Point;
import gnu.trove.map.hash.TIntObjectHashMap;
import org.geotools.referencing.GeodeticCalculator;
import org.wikibrain.conf.Configurator;
import org.wikibrain.core.cmd.Env;
import org.wikibrain.core.cmd.EnvBuilder;
import org.wikibrain.core.dao.DaoException;
import org.wikibrain.core.dao.LocalPageDao;
import org.wikibrain.core.lang.Language;
import org.wikibrain.core.lang.LanguageSet;
import org.wikibrain.core.model.NameSpace;
import org.wikibrain.spatial.dao.SpatialDataDao;
import org.wikibrain.sr.SRMetric;
import org.wikibrain.wikidata.WikidataDao;
import gnu.trove.map.TIntObjectMap;
import java.io.File;
import java.io.FileWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Cookbook example that samples random pairs of spatial Wikidata items and writes one
 * CSV row per pair to {@code ./distance_output.csv}.
 *
 * <p>Each row holds the two item names and ids, the orthodromic distance between the
 * centroids of their geometries divided by 1000 (kilometres, assuming GeoTools reports
 * metres), and, for every configured language, the "ensemble" and "inlink" semantic
 * relatedness scores of the corresponding articles. Only items that have a label in
 * every configured language are eligible for sampling.
 *
 * Created by toby on 4/15/14.
 */
public class CalculateAllDistancePairs {
    private static final Logger LOG = LoggerFactory.getLogger(CalculateAllDistancePairs.class);

    /** Destination of the generated CSV file. */
    private static final String OUTPUT_PATH = "./distance_output.csv";
    /** Spatial layer the geometries are read from. */
    private static final String LAYER_NAME = "wikidata";
    /** Reference system the geometries are read from. */
    private static final String REF_SYS_NAME = "earth";
    /** Number of leading columns that do not depend on the language set. */
    private static final int FIXED_COLUMNS = 5;
    //TODO: Number of data pairs we want
    private static final int NUM_PAIRS = 1000;
    /** Value written in place of a score when a metric fails for a pair. */
    private static final String ERROR_MARKER = "ERROR";

    /**
     * Runs the sampling and writes the CSV file.
     *
     * @param args command line arguments, forwarded to {@link EnvBuilder#envFromArgs}
     * @throws Exception if the environment cannot be configured, a configured language is
     *                   not loaded, the data cannot be read, or the output cannot be written
     */
    public static void main(String[] args) throws Exception {
        Env env = EnvBuilder.envFromArgs(args);
        Configurator c = env.getConfigurator();
        SpatialDataDao sdDao = c.get(SpatialDataDao.class);
        WikidataDao wdDao = c.get(WikidataDao.class);
        LocalPageDao lpDao = c.get(LocalPageDao.class);

        //TODO: modify this field if we want multi-language
        LanguageSet langs = env.getLanguages();
        List<Language> langList = new ArrayList<Language>();
        for (Language language : langs.getLanguages()) {
            langList.add(language);
        }

        // Metrics are stored in the same order as langList so they can be indexed together.
        List<SRMetric> ensembleMetrics = new ArrayList<SRMetric>(langList.size());
        List<SRMetric> inlinkMetrics = new ArrayList<SRMetric>(langList.size());
        for (Language language : langList) {
            ensembleMetrics.add(c.get(SRMetric.class, "ensemble", "language", language.getLangCode()));
            inlinkMetrics.add(c.get(SRMetric.class, "inlink", "language", language.getLangCode()));
        }

        Map<Integer, Geometry> idGeomMap = sdDao.getAllGeometriesInLayer(LAYER_NAME, REF_SYS_NAME);
        LOG.info("Get {} geometries, now building id-name mapping", idGeomMap.size());
        TIntObjectMap<String> idNameMap = buildIdNameMap(idGeomMap, wdDao, lpDao, langs, langList);
        LOG.info("Finish building id-name mapping for {} entities", idNameMap.size());

        int[] itemIds = idNameMap.keys();
        // Rows are sized from the language count, so any number of languages fits.
        int columnCount = FIXED_COLUMNS + 2 * langList.size();
        GeodeticCalculator calc = new GeodeticCalculator();

        // The file is opened only once all inputs are ready, and written as UTF-8 because
        // labels may contain characters outside the platform default charset.
        CSVWriter csvWriter = new CSVWriter(
                new java.io.OutputStreamWriter(new java.io.FileOutputStream(new File(OUTPUT_PATH)), "UTF-8"), ',');
        try {
            csvWriter.writeNext(buildHeader(langList, columnCount));

            if (itemIds.length == 0) {
                LOG.warn("No item has a label in every configured language; only the header was written");
                return;
            }

            for (int pair = 0; pair < NUM_PAIRS; pair++) {
                if (pair % 100 == 0) {
                    LOG.info("Finish calculating {} pairs", pair);
                    csvWriter.flush();
                }
                int item1 = itemIds[(int) (Math.random() * itemIds.length)];
                int item2 = itemIds[(int) (Math.random() * itemIds.length)];

                // A fresh array per pair guarantees a failed pair can never leak values
                // from a previous row into the output.
                String[] row = new String[columnCount];
                try {
                    row[0] = idNameMap.get(item1);
                    row[1] = String.valueOf(item1);
                    row[2] = idNameMap.get(item2);
                    row[3] = String.valueOf(item2);
                    row[4] = String.valueOf(distanceKm(sdDao, calc, item1, item2));

                    Map<Language, String> labels1 = wdDao.getItem(item1).getLabels();
                    Map<Language, String> labels2 = wdDao.getItem(item2).getLabels();
                    for (int i = 0; i < langList.size(); i++) {
                        Language language = langList.get(i);
                        int pageId1 = lpDao.getIdByTitle(labels1.get(language), language, NameSpace.ARTICLE);
                        int pageId2 = lpDao.getIdByTitle(labels2.get(language), language, NameSpace.ARTICLE);
                        row[FIXED_COLUMNS + 2 * i] = similarityOrError(ensembleMetrics.get(i), pageId1, pageId2);
                        row[FIXED_COLUMNS + 2 * i + 1] = similarityOrError(inlinkMetrics.get(i), pageId1, pageId2);
                    }
                } catch (Exception e) {
                    // Best effort: one bad pair must not abort the run, but it is not written either.
                    LOG.warn("Skipping pair (" + item1 + ", " + item2 + ")", e);
                    continue;
                }
                csvWriter.writeNext(row);
            }
        } finally {
            csvWriter.close();
        }
    }

    /**
     * Maps every item that has a label in all configured languages to its label in the
     * default language of {@code langs}.
     *
     * @throws DaoException if a configured language is not loaded or a lookup fails
     */
    private static TIntObjectMap<String> buildIdNameMap(Map<Integer, Geometry> idGeomMap,
                                                         WikidataDao wdDao,
                                                         LocalPageDao lpDao,
                                                         LanguageSet langs,
                                                         List<Language> langList) throws DaoException {
        // The loaded-language check does not depend on the item, so it runs once up front.
        for (Language language : langList) {
            if (!lpDao.getLoadedLanguages().containsLanguage(language)) {
                throw new DaoException(String.format("Language %s not loaded", language.getEnLangName()));
            }
        }

        TIntObjectMap<String> idNameMap = new TIntObjectHashMap<String>();
        int processed = 0;
        for (Integer wdItem : idGeomMap.keySet()) {
            processed++;
            if (processed % 1000 == 0) {
                LOG.info("Finish building name mapping for {} items", processed);
            }
            Map<Language, String> labels = wdDao.getItem(wdItem).getLabels();
            if (!hasAllLabels(labels, langList)) {
                continue;
            }
            //TODO: This one should be changed if we switch to full English
            String name = labels.get(langs.getDefaultLanguage());
            if (name == null) {
                continue;
            }
            idNameMap.put(wdItem, name);
        }
        return idNameMap;
    }

    /** Returns true when {@code labels} has a non-null entry for every language in {@code langList}. */
    private static boolean hasAllLabels(Map<Language, String> labels, List<Language> langList) {
        for (Language language : langList) {
            if (labels.get(language) == null) {
                return false;
            }
        }
        return true;
    }

    /** Builds the header row: five fixed columns followed by two SR columns per language. */
    private static String[] buildHeader(List<Language> langList, int columnCount) {
        String[] header = new String[columnCount];
        header[0] = "ITEM_NAME_1";
        header[1] = "ITEM_ID_1";
        header[2] = "ITEM_NAME_2";
        header[3] = "ITEM_ID_2";
        header[4] = "SPATIAL_DISTANCE";
        for (int i = 0; i < langList.size(); i++) {
            String langCode = langList.get(i).getLangCode();
            header[FIXED_COLUMNS + 2 * i] = "SR_ENSEMBLE_" + langCode;
            header[FIXED_COLUMNS + 2 * i + 1] = "SR_INLINK_" + langCode;
        }
        return header;
    }

    /**
     * Returns the orthodromic distance between the centroids of the two items'
     * geometries, divided by 1000.
     *
     * @throws DaoException if a geometry cannot be read
     */
    private static double distanceKm(SpatialDataDao sdDao, GeodeticCalculator calc, int item1, int item2)
            throws DaoException {
        Point start = sdDao.getGeometry(item1, LAYER_NAME, REF_SYS_NAME).getCentroid();
        calc.setStartingGeographicPoint(start.getX(), start.getY());
        Point destination = sdDao.getGeometry(item2, LAYER_NAME, REF_SYS_NAME).getCentroid();
        calc.setDestinationGeographicPoint(destination.getX(), destination.getY());
        return calc.getOrthodromicDistance() / 1000;
    }

    /** Returns the metric's score for the page pair as text, or "ERROR" if the metric fails. */
    private static String similarityOrError(SRMetric metric, int pageId1, int pageId2) {
        try {
            return String.valueOf(metric.similarity(pageId1, pageId2, false).getScore());
        } catch (Exception e) {
            // The failure is recorded in the output cell; details are kept at debug level.
            LOG.debug("Similarity failed for pages " + pageId1 + " and " + pageId2, e);
            return ERROR_MARKER;
        }
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy