// api.wikibrain-core.0.7.2.source-code.reference.conf (Maven / Gradle / Ivy)
// A default configuration file in HOCON format, almost JSON format
// The file format is described at https://github.com/typesafehub/config.
// Parent directory for data files, downloads, scripts, etc.
// Most other paths in this file are derived from it through ${baseDir}.
baseDir : .
// Directory used for temporary files; by default a ".tmp" folder under baseDir.
// Override this if the default location does not have hundreds of GBs free.
tmpDir : ${baseDir}"/.tmp"
// Maximum number of threads that should run simultaneously.
// Defaults to Runtime.getRuntime().availableProcessors()
// (presumably selected by the -1 value below; confirm against the code that reads it).
maxThreads : -1
// Named language sets.
// A custom language set can also be given on the command line;
// see EnvBuilder for more information.
languages : {
    // Set used by default: the languages that have local pages.
    default : loaded

    // Languages that have local pages loaded.
    loaded : { type : loaded }

    // Languages that have downloaded articles files.
    downloaded : { type : downloaded }

    // Languages of the largest world economies.
    big-economies : {
        type : custom
        langCodes : [
            "en", "de", "fr", "nl", "it", "pl", "es", "ru", "ja", "pt", "zh",
            "sv", "uk", "ca", "no", "fi", "cs", "hu", "ko", "id", "tr", "ro",
            "sk", "da", "he", "simple"
        ]
    }

    // The English-language editions.
    all-english : {
        type : custom
        langCodes : ["en", "simple"]
    }

    // Empty placeholder that can be populated dynamically
    // using the EnvBuilder or the command line.
    manual : {
        type : custom
        langCodes : []
    }
}
// Download settings: which filesets are fetched by default and where they go.
download : {
    // Filesets downloaded by default.
    matcher : ["articles", "links"]
    // Directory that receives downloaded files.
    path : ${baseDir}"/download"
    // TSV listing file kept inside the download directory.
    listFile : ${download.path}"/list.tsv"
}
// Settings for the Lucene search engine.
lucene : {
    version : "4.3"
    directory : ${baseDir}"/db/lucene"

    // Named option sets; "default" selects the one used when none is named.
    options : {
        default : plaintext

        plaintext : {
            type : plaintext
            version : ${lucene.version}
            directory : ${lucene.directory}
            namespaces : ["article"]

            // TokenizerOptions
            caseInsensitive : true
            useStopWords : true
            useStem : true

            // TextFieldElements
            title : 0
            redirects : false
            plaintext : true
        }

        esa : {
            type : esa
            version : ${lucene.version}
            directory : ${lucene.directory}
            namespaces : ["article"]

            // TokenizerOptions
            caseInsensitive : true
            useStopWords : true
            useStem : true

            // TextFieldElements
            title : 1
            redirects : true
            plaintext : true
        }
    }

    // Named searchers; each refers to one of the option sets above.
    searcher : {
        esa : {
            options : esa
        }
    }
}
// Multilingual string normalizers.
stringnormalizers {
    default : identity

    // Leaves strings untouched.
    identity : {
        type : identity
    }

    // Removes punctuation.
    simple : {
        type : lucene
        version : ${lucene.version}
        caseInsensitive : false
        useStopWords : false
        useStem : false
    }

    // Removes punctuation and folds case.
    foldcase : {
        type : lucene
        version : ${lucene.version}
        caseInsensitive : true
        useStopWords : false
        useStem : false
    }

    // Folds case and applies Porter stemming.
    stemmer : {
        type : lucene
        version : ${lucene.version}
        caseInsensitive : true
        useStopWords : false
        useStem : true
    }
}
// Phrase analyzers resolve phrases to articles and vice versa.
phrases {
    // Whether or not the phrase analyzers are being loaded.
    // Will be overridden while saving corpora to the daos.
    loading : false

    // Base path for all phrase analyzer databases.
    path : ${baseDir}"/db/phrases/"

    // Analyzers the loader should load by default.
    toLoad : [ "anchortext" ]

    // Analyzers
    analyzer : {
        default : fast-cascading

        stanford : {
            phraseDao : stanford
            localPageDao : default
            path : ${download.path}"/stanford-dictionary.bz2"
            url : "http://www-nlp.stanford.edu/pubs/crosswikis-data.tar.bz2/dictionary.bz2"
            type : stanford
            phrasePruner : {
                type : string
                minCount : 3
                maxRank : 10
                minFraction : 0.001
            }
            pagePruner : {
                type : simple
                minCount : 3
                maxRank : 15
                minFraction : 0.001
            }
            dao : {
                isNew : ${phrases.loading}
                type : objectdb
                normalizer : default
            }
        }

        anchortext : {
            phraseDao : anchortext
            localPageDao : default
            localLinkDao : default
            type : anchortext
            phrasePruner : {
                type : string
                minCount : 1
                maxRank : 10
                minFraction : 0.001
            }
            pagePruner : {
                type : simple
                minCount : 1
                maxRank : 15
                minFraction : 0.001
            }
            dao : {
                isNew : ${phrases.loading}
                type : objectdb
                normalizer : default
            }
        }

        // Same as anchortext, but the dao normalizer folds case.
        anchortext-foldcase : ${phrases.analyzer.anchortext} {
            dao.normalizer : foldcase
        }

        // Same as anchortext, but the dao normalizer is the stemmer.
        anchortext-stemmed : ${phrases.analyzer.anchortext} {
            dao.normalizer : stemmer
        }

        lucene : {
            type : lucene
            localPageDao : default
        }

        // Cascades over the stanford and lucene analyzers.
        cascading : {
            type : cascading
            delegates : [ "stanford", "lucene" ]
        }

        // Cascades over the anchortext and lucene analyzers.
        fast-cascading : {
            type : cascading
            delegates : [ "anchortext", "lucene" ]
        }

        titleRedirect {
            type : titleRedirect
            useRedirects : true
            localPageDao : default
            redirectDao : default
        }
    }

    linkProbability : {
        objectDb : {
            path : ${baseDir}"/db/phrases/linkProbability"
            phraseAnalyzer : anchortext
        }
    }
}
// Data access objects.
// Each group has a "default" key naming the entry used when none is requested.
dao : {
    dataSource : {
        default : h2

        h2 : {
            driver : org.h2.Driver
            url: "jdbc:h2:"${baseDir}"/db/h2;LOG=0;CACHE_SIZE=65536;LOCK_MODE=0;UNDO_LOG=0;MAX_OPERATION_MEMORY=100000000"
            username : sa
            password : ""

            // Connection pooling
            // This sets the total number of jdbc connections to a minimum of 16.
            // partitions defaults to max(8, num-logical-cores)
            partitions : default
            connectionsPerPartition : 2
        }

        psql : {
            driver : org.postgresql.Driver
            url: "jdbc:postgresql://localhost/wikibrain"
            username : toby
            password : ""

            // Connection pooling
            // This sets the total number of jdbc connections to a minimum of 16.
            // partitions defaults to max(8, num-logical-cores)
            partitions : default
            connectionsPerPartition : 2
        }
    }

    // NOTE(review): several "live" entries in this section are empty objects
    // while others declare "type : live" - confirm the empty ones are intentional.
    metaInfo : {
        default : sql
        sql : {
            type : sql
            dataSource : default
        }
        live : {}
    }

    sqlCachePath : ${baseDir}"/db/sql-cache"

    localPage : {
        default : sql
        sql : {
            type : sql
            dataSource : default
        }
        live : {
            type : live
        }
    }

    pageView : {
        default : sql
        sql : {
            type : sql
            dir : ${baseDir}"/download/pageviews"
            dataSource : default
        }
        db : {
            type : db
        }
    }

    interLanguageLink : {
        default : sql
        sql : {
            type : sql
            dataSource : default
        }
    }

    localLink : {
        default : matrix
        sql : {
            type : sql
            dataSource : default
        }
        // Matrix-backed dao that delegates to the sql dao.
        matrix : {
            type : matrix
            delegate : sql
            path : ${baseDir}"/db/matrix/local-link"
        }
        live : {
            type : live
        }
    }

    rawPage : {
        default : sql
        sql : {
            type : sql
            dataSource : default
            localPageDao : sql
        }
        live : {}
    }

    wikidata : {
        default : sql
        sql : {
            type : sql
            dataSource : default
            localPageDao : sql
        }
        live : {}
    }

    universalPage : {
        default : wikidata
        wikidata : {
            type : sql
            mapper : purewikidata
            dataSource : default
        }
        monolingual : {
            type : sql
            mapper : monolingual
            dataSource : default
        }
        live : {}
    }

    localCategoryMember : {
        default : sql
        sql : {
            type : sql
            dataSource: default
        }
        live : {
            type : live
        }
    }

    localArticle : {
        default : sql
        sql : {
            type : sql
            dataSource: default
        }
        live : {
            type : live
        }
    }

    localCategory : {
        default : sql
        sql : {
            type : sql
            dataSource: default
        }
        live : {
            type : live
        }
    }

    universalLink : {
        default : sql-wikidata
        sql-wikidata : {
            type : sql
            dataSource : default
            mapper : purewikidata
            localLinkDao : sql
        }
        skeletal-sql-wikidata : {
            type : skeletal-sql
            mapper : purewikidata
            dataSource : default
        }
        live : {}
    }

    redirect : {
        default : sql
        sql : {
            type : sql
            dataSource : default
        }
        live : {
            type : live
        }
    }

    diagnostic : {
        default : parse
        parse : {
            tokenFile : ${baseDir}"/dat/token.txt"
            log : ${baseDir}"/dat/stages.csv"
            // NOTE(review): appId and restApiId look like service credentials
            // stored in plain text - confirm they are meant to be public, or
            // supply them through an override file instead.
            appId : "6VaQtzvMSzzXXm0VMlm9IHKUYXAYwKRlsW19faV7"
            restApiId: "cxkuWsM6rIEVamS0OU5zpbKvuChzK8nw4XXQVwjJ"
        }
    }
}
// Concept mappers; "default" names the mapper used when none is requested.
mapper : {
    default : purewikidata

    monolingual : {
        type : monolingual
        // each algorithm must have a unique ID
        algorithmId : 0
        localPageDao : sql
    }

    purewikidata : {
        type : purewikidata
        algorithmId : 1
        localPageDao : sql
    }
}
// Semantic relatedness (SR) configuration: disambiguators, metrics, corpora,
// ensembles, normalizers, datasets and wikifiers.
sr : {

    // Disambiguators; "default" names the one used when none is requested.
    disambig : {
        default : similarity

        topResult : {
            type : topResult
            phraseAnalyzer : default
        }

        titleRedirect : {
            type : topResult
            phraseAnalyzer : titleRedirect
        }

        topResultConsensus : {
            type : topResultConsensus
            phraseAnalyzers : ["lucene","stanford","anchortext"]
        }

        milnewitten : {
            type : milnewitten
            metric : milnewitten
            phraseAnalyzer : default
        }

        similarity : {
            type : similarity
            metric : inlinknotrain
            phraseAnalyzer : default

            // how to score candidate senses. Possibilities are:
            //      popularity: just popularity
            //      similarity: just similarity
            //      product: similarity * popularity
            //      sum: similarity + popularity
            criteria : sum
        }
    }

    concepts {
        path : ${baseDir}"/dat/sr/concepts/"
    }

    blacklist {
        path : ""
    }

    // The parent configuration for all sparse-vector-based SR metrics
    sparsevectorbase {
        type : sparsevector
        pageDao : default
        disambiguator : default

        // Concrete metrics must override the generator
        generator : { type : OVERRIDE_THIS }

        // Default vector similarity is cosine similarity
        similarity : { type : cosine }

        // normalizers
        similaritynormalizer : percentile
        mostsimilarnormalizer : percentile

        // Controls how phrase vectors are created. Values can be:
        //      none: do not create phrase vectors. disambiguate instead.
        //      generator: ask the feature generator to create the phrase vectors
        //      creator: ask the phrase vector creator to create the phrase vectors
        //      both: first ask the generator, then the creator
        phraseMode : none
    }

    // The parent configuration for all dense-vector-based SR metrics
    densevectorbase {
        type : densevector
        pageDao : default
        disambiguator : default

        // Concrete metrics must override the generator
        generator : { type : OVERRIDE_THIS }

        // Default vector similarity is cosine similarity
        similarity : { type : cosine }

        // normalizers
        similaritynormalizer : percentile
        mostsimilarnormalizer : percentile
    }

    metric {
        // when training, normalizers are not read from disk
        training : false

        path : ${baseDir}"/dat/sr/"

        local : {
            default : milnewitten

            ESA : ${sr.sparsevectorbase} {
                generator : {
                    type : esa
                    luceneSearcher : esa
                    concepts : ${sr.concepts.path}
                }
                similarity : { type : cosine }
            }

            word2vec-nophrase : ${sr.densevectorbase} {
                generator : {
                    type : word2vec
                    corpus : plain
                    modelDir : ${baseDir}"/dat/word2vec"
                }
            }

            word2vec : ${sr.densevectorbase} {
                generator : {
                    type : word2vec
                    corpus : wikified
                    modelDir : ${baseDir}"/dat/word2vec"
                }
                reliesOn : [ "word2vecRaw" ]
            }

            // Used for performance reasons in the websail wikifier
            word2vecRaw : ${sr.densevectorbase} {
                generator : {
                    type : word2vec
                    corpus : plain
                    modelDir : ${baseDir}"/dat/word2vecRaw"
                }
                similaritynormalizer : identity
                mostsimilarnormalizer : identity
            }

            // Uses prebuilt 300-dimensional vectors from Google
            // Requires you to download "GoogleNews-vectors-negative300.bin.gz"
            // and place it in your download directory.
            prebuiltword2vec : ${sr.densevectorbase} {
                generator : {
                    type : word2vec
                    corpus : NONE
                    binfile : ${baseDir}"/download/GoogleNews-vectors-negative300.bin.gz"
                    languages : ["simple", "en"]
                    modelDir : ${baseDir}"/dat/prebuiltword2vec"
                    url : "https://code.google.com/p/word2vec/"
                    prebuilt : true
                }
                similarity : { type : cosine }
            }

            // ESA with identity normalizers.
            ESAnotrain : ${sr.sparsevectorbase} {
                generator : {
                    type : esa
                    luceneSearcher : esa
                    concepts : ${sr.concepts.path}
                }
                similarity : { type : cosine }
                similaritynormalizer : identity
                mostsimilarnormalizer : identity
            }

            outlink : ${sr.sparsevectorbase} {
                generator : {
                    type : links
                    outLinks : true
                    weightByPopularity : true
                    logTransform : true
                }
                similarity : {
                    type : cosine
                }
            }

            inlink : ${sr.sparsevectorbase} {
                generator : {
                    type : links
                    outLinks : false
                }
                similarity : {
                    type : google
                }
            }

            // inlink with identity normalizers.
            inlinknotrain : ${sr.sparsevectorbase} {
                generator : {
                    type : links
                    outLinks : false
                }
                similarity : {
                    type : google
                }
                similaritynormalizer : identity
                mostsimilarnormalizer : identity
            }

            directlink : {
                type : directlink
                disambiguator : default
                similaritynormalizer : identity
                mostsimilarnormalizer : identity
            }

            milnewitten : {
                type : milnewitten
                inlink : inlink
                outlink : outlink
                disambiguator : milnewitten
                similaritynormalizer : identity
                mostsimilarnormalizer : identity
            }

            synrank : {
                type : synrank
                similaritynormalizer : identity
                mostsimilarnormalizer : identity
                disambiguator : titleRedirect
            }

            simplemilnewitten : {
                type : simplemilnewitten
            }

            // NOTE(review): "milnewittenout" is not defined among the metrics
            // visible in this file - confirm it is provided elsewhere.
            fast-ensemble : {
                type : ensemble
                metrics : ["milnewitten","milnewittenout"]
                similaritynormalizer : identity
                mostsimilarnormalizer : identity
                ensemble : linear
                disambiguator : default
                pageDao : default
            }

            ensemble : {
                type : ensemble
                metrics : ["ESA", "category", "milnewitten"]
                similaritynormalizer : percentile
                mostsimilarnormalizer : percentile
                ensemble : linear
                resolvephrases : false
                disambiguator : default
                pageDao : default
            }

            word2vec-ensemble : {
                type : ensemble
                metrics : ["ESA","inlink","outlink","category","word2vec","milnewitten","directlink"]
                similaritynormalizer : percentile
                mostsimilarnormalizer : percentile
                ensemble : linear
                resolvephrases : false
                disambiguator : default
                pageDao : default
            }

            super-ensemble : {
                type : ensemble
                metrics : ["ESA","inlink","outlink","category","mostsimilarconcepts","milnewitten"]
                similaritynormalizer : percentile
                mostsimilarnormalizer : percentile
                ensemble : linear
                resolvephrases : false
                disambiguator : default
                pageDao : default
            }

            mostsimilarconcepts : ${sr.sparsevectorbase} {
                generator : {
                    type : mostsimilarconcepts
                    basemetric : "msc-ensemble"
                    concepts : ${sr.concepts.path}
                    maxResults : 1000
                }
            }

            // for most similar concepts
            msc-ensemble : {
                // Set defaults for training the cosimilarity matrix.
                mostSimilarConcepts : ${sr.concepts.path}
                maxResults : 1000

                type : ensemble
                metrics : ["ESA","inlink","outlink","category","milnewitten"]
                similaritynormalizer : percentile
                // Declared once; the original listed this same key/value twice.
                mostsimilarnormalizer : rank
                ensemble : linear
                resolvephrases : false
                disambiguator : default
                pageDao : default
                buildMostSimilarCache : true
            }

            category : {
                type : categorygraphsimilarity
                disambiguator : default
                pageDao : default
                categoryMemberDao : default
                similaritynormalizer : percentile
                mostsimilarnormalizer : percentile
            }

            // One coefficient is listed per metric, in the same order.
            known-phrase : {
                type: knownphrase
                stringnormalizer: simple
                metrics: ["prebuiltword2vec", "ESA", "outlink", "inlink"]
                coefficients: [0.25, 0.45, 0.12, 0.18]
            }
        }
    }

    corpus {
        plain : {
            path : ${baseDir}"/dat/corpus/plain/"
            wikifier : identity
            rawPageDao : default
            localPageDao : default
            phraseAnalyzer : anchortext
        }
        wikified : {
            path : ${baseDir}"/dat/corpus/wikified/"
            wikifier : websail
            rawPageDao : default
            localPageDao : default
            phraseAnalyzer : anchortext
        }
    }

    ensemble {
        default : linear
        even : {
            type : even
        }
        linear : {
            type : linear
        }
    }

    normalizer {
        defaultmaxresults : 100
        identity : {
            type : identity
        }
        logLoess : {
            type : loess
            log : true
        }
        loess : {
            type : loess
        }
        log : {
            type : log
        }
        percentile : {
            type : percentile
        }
        range : {
            type : range
            min : 0.0
            max : 1.0
            truncate : true
        }
        rank : {
            type : rank
        }
    }

    explanationformatter {
        explanationformatter {
            localpagedao : sql
        }
    }

    dataset : {
        dao : {
            resource : {
                type : resource
                disambig : topResult
                resolvePhrases : true
            }
        }
        defaultsets : ["wordsim353.txt","MC.txt"]
        groups : {
            // large, commonly used datasets
            major-en : ["wordsim353.txt", "MTURK-771.csv", "atlasify240.txt", "radinsky.txt"]
        }
        // pairs under this threshold won't be used for most similar training.
        mostSimilarThreshold : 0.7
        records : ${baseDir}"/dat/records/"
    }

    wikifier : {
        default : identity
        milnewitten : {
            type : milnewitten
            phraseAnalyzer : anchortext
            sr : inlinknotrain
            localLinkDao : matrix
            useLinkProbabilityCache : true
        }
        identity : {
            type : identity
            localLinkDao : sql
        }
        websail : {
            type : websail
            phraseAnalyzer : anchortext
            sr : word2vecRaw
            identityWikifier : identity
            localLinkDao : matrix
            useLinkProbabilityCache : true
        }
    }
}
// Spatial module: data directory, spatial daos, and shapefile datasets.
spatial : {
    dir : ${baseDir}"/dat/spatial"

    dao : {
        dataSource : {
            // These all use keys standard to Geotools JDBC
            // see: http://docs.geotools.org/stable/userguide/library/jdbc/datastore.html
            // change this part according to your DB settings
            default : postgis
            postgis : {
                dbtype : postgis
                host : localhost
                port : 5432
                schema : public
                database : wikibrain_spatial
                user : toby
                passwd : ""
                max connections : 19
            }
        }

        spatialData : {
            default : postgis
            postgis {
                dataSource : postgis
            }
        }

        spatialContainment : {
            default : postgis
            postgis {
                dataSource : postgis
            }
        }

        spatialNeighbor : {
            default : postgis
            postgis {
                dataSource : postgis
            }
        }
    }

    datasets {
        earth.country.naturalEarth : {
            url : "http://www.naturalearthdata.com/http//www.naturalearthdata.com/download/10m/cultural/ne_10m_admin_0_countries.zip"
            mappingUrl : "http://www.shilad.com/wikibrain/naturalEarth.country.zip"
            encoding : "UTF-8"

            // Name of shapefile within the zip archive
            shp : ne_10m_admin_0_countries.shp

            // Fields that may contain titles, in order of priority
            titles : [ "NAME", "NAME_LONG", "FORMAL_EN", "ADMIN"]

            // Fields that may contain words that provide context
            context : [ "SUBREGION", "CONTINENT" ]

            // Fields that will be concatenated together to create a persistent key in the file
            key : ["NAME_LONG" ]

            // Other fields that should be included in the spreadsheet
            other : []

            scorers : [
                { type : instanceOf, file : /spatial-matcher/country.txt, weight : 1.0 }
                { type : wikidataValue, property : "FIPS 10-4 (countries and regions)", column : FIPS_10_, weight : 4.0 }
            ]
            dab : topResult
        }

        earth.state.naturalEarth : {
            url : "http://www.naturalearthdata.com/http//www.naturalearthdata.com/download/10m/cultural/ne_10m_admin_1_states_provinces.zip"
            mappingUrl : "http://www.shilad.com/wikibrain/naturalEarth.state.zip"
            encoding : "UTF-8"

            // Name of shapefile within the zip archive
            shp : ne_10m_admin_1_states_provinces.shp

            // Fields that may contain titles, in order of priority
            titles : [ "NAME", "NAME_LOCAL", "NAME_ALT", "WOE_NAME", "GN_NAME" ]
            country : [ "ADMIN", "GEONUNIT" ]
            fips : [ "FIPS" ]

            // Fields that may contain words that provide context
            context : [ "REGION" ]

            // Fields that will be concatenated together to create a persistent key in the file
            key : ["NAME", "ADMIN" ]

            // Other fields that should be included in the spreadsheet
            other : []

            scorers : [
                { type : instanceOf, file : /spatial-matcher/state.txt, weight : 1.0 }
                { type : contains, weight : 1.0 }
                { type : wikidataValue, property : "FIPS 10-4 (countries and regions)", column : FIPS, weight : 4.0 }
                { type : wikidataValue, property : "ISO 3166-2", column : "ISO_3166_2", weight : 4.0 }
            ]
            dab : topResult
        }

        earth.marine.naturalEarth : {
            url : "http://www.naturalearthdata.com/http//www.naturalearthdata.com/download/10m/physical/ne_10m_geography_marine_polys.zip"
            mappingUrl : "http://www.shilad.com/wikibrain/naturalEarth.marine.zip"
            encoding : "UTF-8"

            // Name of shapefile within the zip archive
            shp : ne_10m_geography_marine_polys.shp

            // Fields that may contain titles, in order of priority
            titles : [ "NAME", "NAMEALT" ]

            // Fields that will be concatenated together to create a persistent key in the file
            key : ["NAME" ]

            context : []

            // Other fields that should be included in the spreadsheet
            other : []

            scorers : [
                { type : instanceOf, file : /spatial-matcher/marine.txt, weight : 1.0 }
                { type : contains, weight : 1.0 }
            ]
            dab : topResult
        }
    }
}
// Loading pipeline: named groups of stages and the stage definitions themselves.
loader {
    groups {
        core : [ "fetchlinks", "download", "dumploader", "redirects", "wikitext", "lucene", "phrases"]
        // The core stages followed by "concepts".
        multilingual-core : ${loader.groups.core} ["concepts"]
    }

    // Stages of the loading pipeline, used by PipelineLoader.
    // runtime, diskSpace and downloadSize are strings that may contain
    // #{...} placeholders such as #{articles} or #{links}.
    stages : [
        {
            name : fetchlinks
            class : org.wikibrain.download.RequestedLinkGetter
            extraArgs : []
            runtime : "0"
            diskSpace : "1"
        },
        {
            name : download
            class : org.wikibrain.download.DumpFileDownloader
            dependsOnStage : fetchlinks
            extraArgs : []
            runtime : "0"
            diskSpace : "#{articles} / 250"
            downloadSize : "#{articles} / 250"
        },
        {
            name : dumploader
            class : org.wikibrain.loader.DumpLoader
            runtime : "#{articles} / #{multiCoreSpeed} / 300.0 + #{articles} / #{singleCoreSpeed} / 4000.0"
            dependsOnStage : download
            loadsClass : LocalPage
            extraArgs : ["-d"]
            diskSpace : "#{articles} / 175"
        },
        {
            name : redirects
            class : org.wikibrain.loader.RedirectLoader
            dependsOnStage : dumploader
            loadsClass : Redirect
            extraArgs : ["-d"]
            runtime : "#{articles} / #{singleCoreSpeed} / 15000.0"
            diskSpace : "#{articles} / 3500"
        },
        {
            name : wikitext
            class : org.wikibrain.loader.WikiTextLoader
            loadsClass : LocalLink
            dependsOnStage : redirects
            extraArgs : ["-d"]
            runtime : "#{links} / #{multiCoreSpeed} / 3000.0 + #{links} / #{singleCoreSpeed} / 25000.0"
            diskSpace : "#{links} / 8000"
        },
        {
            name : lucene
            class : org.wikibrain.loader.LuceneLoader
            loadsClass : LuceneSearcher
            dependsOnStage : wikitext
            extraArgs : []
            runtime : "#{articles} / #{multiCoreSpeed} / 90.0"
            diskSpace : "#{articles} / 140"
        },
        {
            name : phrases
            class : org.wikibrain.loader.PhraseLoader
            loadsClass: PrunedCounts
            dependsOnStage : wikitext
            extraArgs : ["-p", "anchortext"]
            runtime : "#{links} / #{singleCoreSpeed} / 90000.0"
            diskSpace : "#{links} / 40000"
        },
        {
            name : concepts
            class : org.wikibrain.loader.ConceptLoader
            dependsOnStage : redirects
            loadsClass : UniversalPage
            extraArgs : ["-d"]
            runtime : "#{articles} / #{singleCoreSpeed} / 3000.0 + 300"
            diskSpace : "#{articles} / 3500"
            downloadSize : "550"
        },
        {
            name : universal
            class : org.wikibrain.loader.UniversalLinkLoader
            dependsOnStage : [ "concepts", "wikitext" ]
            loadsClass: UniversalLink
            extraArgs : ["-d"]
            runtime : "#{links} / #{multiCoreSpeed} / 3000.0 + #{links} / #{singleCoreSpeed} / 25000.0"
            diskSpace : "#{links} / 4000"
        },
        {
            name : wikidata
            class : org.wikibrain.wikidata.WikidataDumpLoader
            dependsOnStage : concepts
            loadsClass: WikidataEntity
            extraArgs : ["-d"]
            runtime : "900 + #{articles} / #{multiCoreSpeed} / 300.0"
            diskSpace : "#{links} / 60000"
            downloadSize : "2400"
        },
        {
            name : spatial
            class : org.wikibrain.spatial.loader.SpatialDataLoader
            dependsOnStage : wikidata
            loadsClass: Geometry
            extraArgs : ["-d"]
            runtime : "900 / #{singleCoreSpeed}"
            diskSpace : "200"
            downloadSize : "50"
        },
        {
            name : sr
            class : org.wikibrain.sr.SRBuilder
            dependsOnStage : ["wikitext", "phrases", "lucene"]
            extraArgs : ["-m", "milnewitten", "-o", "both"]
            runtime : "#{articles} / #{singleCoreSpeed} / 15000.0 + #{articles} / #{multiCoreSpeed} / 2000.0"
            diskSpace : "#{links} / 24000"
        }
    ]
}
// Backup directory for integration tests.
integration {
    dir : ${baseDir}"/backup"
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy