package breeze.classify
/*
 Copyright 2010 David Hall, Daniel Ramage

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

 http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
*/
import breeze.linalg._
import breeze.numerics._
import breeze.data._
import breeze.optimize._
import breeze.optimize.FirstOrderMinimizer.OptParams
import breeze.math.MutableCoordinateSpace
import breeze.util.Index
import breeze.util.logging.ConsoleLogging
/**
 * A multi-class logistic/softmax/maxent classifier.
 *
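 * The model scores each label k as (w_k dot x); probabilities come from a softmax,
 * P(y = k | x) = exp(w_k dot x) / sum_k' exp(w_k' dot x), and training minimizes
 * the negative log-likelihood of the data (see ObjectiveFunction below).
 *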
 * @author dlwh
 */
object LogisticClassifier {

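  // Demo: fetch the ElemStatLearn spam dataset, train a classifier on it, and
  // print (guessed label, gold label) for every training example.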
  def main(args: Array[String]) {
    val data = DataMatrix.fromURL(new java.net.URL("http://www-stat.stanford.edu/~tibs/ElemStatLearn/datasets/spam.data"), -1)
    val vectors = data.rows.map(e => e map ((a: Seq[Double]) => new DenseVector(a.toArray)) relabel (_.toInt))
    val classifier = new LogisticClassifier.Trainer[Int, DenseVector[Double]].train(vectors)
    for (ex <- vectors) {
      val guessed = classifier.classify(ex.features)
      println((guessed, ex.label))
    }
  }

  /**
   * @tparam L the label type
   * @tparam TF the feature vector type: the input vectors to the classifier
   * @return a LinearClassifier based on the fitted model
   */
  class Trainer[L, TF](opt: OptParams = OptParams())(implicit arith: MutableCoordinateSpace[TF, Double],
                                                     man: ClassManifest[TF]) extends Classifier.Trainer[L, TF] {
    import arith._

    type MyClassifier = LinearClassifier[L, UnindexedLFMatrix[L, TF], Counter[L, Double], TF]

    def train(data: Iterable[Example[L, TF]]) = {
      require(data.size > 0)
      val labelIndex = Index[L]()
      data foreach { ex => labelIndex.index(ex.label) }
      val guess = new LFMatrix[L, TF](zeros(data.head.features), labelIndex)
      val obj = new CachedBatchDiffFunction(objective(data.toIndexedSeq, labelIndex))(LFMatrix.canCopy(copy))
//      val adjusted = SecondOrderFunction.empirical(obj)
//      val adjusted = new FisherDiffFunction(obj)
//      val weights = new TruncatedNewtonMinimizer[LFMatrix[L, TF], EmpiricalHessian[LFMatrix[L, TF]]](l2Regularization = 1.0).minimize(adjusted, guess)
      val weights = opt.minimize(obj, guess)(LFMatrix.coordSpace)
//      val weights = (new LBFGS[LFMatrix[L, TF]]() with ConsoleLogging).minimize(DiffFunction.withL2Regularization(obj, 1.0), guess)
      new LinearClassifier(weights.unindexed, Counter[L, Double]())
    }

    protected def objective(data: IndexedSeq[Example[L, TF]], labelIndex: Index[L]) = new ObjectiveFunction(data, labelIndex)

    // The objective function: the batch negative log-likelihood of the data under the softmax model.
    protected class ObjectiveFunction(data: IndexedSeq[Example[L, TF]], labelIndex: Index[L]) extends BatchDiffFunction[LFMatrix[L, TF]] {

      // Computes the dot product of the weights with the datum, one score per label
      def logScores(weights: LFMatrix[L, TF], datum: TF): DenseVector[Double] = {
        weights * datum
      }

      val fullRange = (0 until data.size)
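
      // Returns the (negative log-likelihood, gradient) pair over the given minibatch
      // of example indices; both are rescaled by data.size / range.size below so that
      // minibatch estimates are on the same scale as the full-batch objective.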
      override def calculate(weights: LFMatrix[L, TF], range: IndexedSeq[Int]) = {
        var ll = 0.0
        val grad = weights.empty
        assert(!breeze.linalg.norm(weights).isNaN, weights)
        for (datum <- range.view map data) {
          val logScores: DenseVector[Double] = this.logScores(weights, datum.features)
          val logNormalizer = softmax(logScores)
          val goldLabel = labelIndex(datum.label)
          ll -= (logScores(goldLabel) - logNormalizer)
          assert(!ll.isNaN, logNormalizer + " " + logScores + " " + weights + " " + datum + " " + (weights * datum.features))
          // d ll / d weight_kj = \sum_i x_ij (I(group_i = k) - p_k(x_i; Beta))
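          // Since ll accumulates the *negative* log-likelihood, each datum contributes
          // -(I(label == gold) - p_k) * x to grad(label), where p_k = exp(score_k - logNormalizer)
          // is the model probability of label k; the axpy call below adds exactly that.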
          for (label <- 0 until labelIndex.size) {
            val prob_k = math.exp(logScores(label) - logNormalizer)
            assert(prob_k >= 0 && prob_k <= 1, prob_k)
            axpy(-(I(label == goldLabel) - prob_k), datum.features, grad(label))
          }
        }
        assert(!breeze.linalg.norm(grad).isNaN, grad)
        grad *= (data.size * 1.0 / range.size)
        ll *= (data.size * 1.0 / range.size)
        (ll, grad)
      }
    }
  }
}

/**
 * This is an example app for creating a logistic classifier from data that is
 * stored as string-valued features and string-valued labels, e.g.
 *
 * verb=join,noun=board,prep=as,prep_obj=director,V
 * verb=is,noun=chairman,prep=of,prep_obj=N.V.,N
 * verb=named,noun=director,prep=of,prep_obj=conglomerate,N
 *
 * These are examples from Ratnaparkhi's classic prepositional phrase attachment
 * dataset, discussed in the following homework:
 *
 * http://ata-s12.utcompling.com/assignments/classification
 *
 * The homework includes pointers to the data and to Scala code for generating
 * said features.
 *
 * This example handles creating a feature index and getting the examples into the
 * right data structures for training with the logistic regression classifier,
 * which should serve as a useful example for creating features and classifiers
 * using the API.
 *
 * @author jasonbaldridge
 */
object LogisticClassifierFromCsv {

  import breeze.util.Index
  import breeze.config._
  import breeze.stats.ContingencyStats
  import Counter._
  import java.io.File

  case class Params(
    @Help(text = "Input training file in CSV format.") train: File,
    @Help(text = "Input eval file in CSV format.") eval: File,
    @Help(text = "Show evaluation.") showEval: Boolean = false,
    @Help(text = "Output full distributions.") fullOutput: Boolean = false,
    @Help(text = "Regularization value (default 1.0).") reg: Double = 1.0,
    @Help(text = "Tolerance (stopping criterion) (default 1E-4).") tol: Double = 1E-4,
    @Help(text = "Maximum number of iterations (default 1000).") maxIterations: Int = 1000,
    @Help(text = "Optimization parameters.") opt: OptParams,
    @Help(text = "Prints this.") help: Boolean = false
  )
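
  // A sketch of a command-line invocation (hypothetical paths; this assumes
  // breeze.config's CommandLineParser maps case-class fields to "--name value"
  // arguments):
  //
  //   LogisticClassifierFromCsv --train train.csv --eval eval.csv --showEval true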

  def main(args: Array[String]) {
    val config = CommandLineParser.parseArguments(args)._1
    val params = config.readIn[Params]("")

    // Feature map: string feature -> integer index
    val fmap = Index[String]()

    // Read in the training data and index its features.
    val trainingData =
      SparseCsvDataset(io.Source.fromFile(params.train))
        .map(ex => ex.map(_.map(fmap.index(_))))
        .toList // Need to consume the lines in order to populate the feature map

    val numFeatures = fmap.size

    val trainingExamples =
      trainingData
        .zipWithIndex
        .map { case (ex, rowId) =>
          Example[String, SparseVector[Double]](
            label = ex.label,
            features = makeSparseFeatureVector(ex, numFeatures),
            id = rowId.toString
          )
        }

    // Train the classifier
    val opt = params.opt
    val classifier =
      new LogisticClassifier.Trainer[String, SparseVector[Double]](opt)
        .train(trainingExamples)

    // Read in the evaluation data; indexOpt/flatMap silently drops features
    // never seen in training.
    val evalExamples =
      SparseCsvDataset(io.Source.fromFile(params.eval))
        .zipWithIndex
        .map { case (ex, rowId) =>
          val mappedEx = ex.map(_.flatMap(fmap.indexOpt(_)))
          Example[String, SparseVector[Double]](
            label = ex.label,
            features = makeSparseFeatureVector(mappedEx, numFeatures),
            id = rowId.toString
          )
        }
        .toList

    // Get the predictions: a raw score counter per eval example
    val predictions = evalExamples.map(ex => classifier.scores(ex.features))

    // Output the full distribution over labels, most probable label first
    if (params.fullOutput) {
      predictions.foreach { prediction =>
        val distribution = exp(logNormalize(prediction))
        val sortedDistributionString =
          distribution
            .argsort
            .reverse
            .map(label => label + " " + distribution(label))
            .mkString(" ")
        println(sortedDistributionString)
      }
    }

    // Output evaluation statistics
    if (params.showEval) {
      println("\n-----------------------------------------------------")
      println("Training set 'performance':")
      println(ContingencyStats(classifier, trainingExamples))
      println("\n-----------------------------------------------------")
      println("Eval set performance:")
      println(ContingencyStats(classifier, evalExamples))
    }
  }

  /**
   * Creates a sparse feature vector from an indexed example. Assumes we have
   * observations of each feature, and these could show up multiple times, so
   * we count those occurrences and use the counts as the values stored in the
   * vector.
   */
  private def makeSparseFeatureVector(example: Example[String, Seq[Int]], numFeatures: Int) = {
    val vec = SparseVector.zeros[Double](numFeatures)
    example
      .features
      .groupBy(x => x)
      .mapValues(_.length.toDouble)
      .foreach { case (index, value) => vec(index) = value }
    vec
  }
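
  // For instance, features Seq(0, 2, 2) with numFeatures = 4 yield the vector
  // (1.0, 0.0, 2.0, 0.0): feature 0 occurs once and feature 2 twice.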

  /**
   * Read in a dataset and create Examples from it. Don't do any feature indexation,
   * since for training data we want to build the index, but for eval data we just
   * want to use it.
   */
  object SparseCsvDataset {

    def apply(dataSource: io.Source): Iterator[Example[String, Seq[String]]] =
      dataSource
        .getLines
        .zipWithIndex
        .map { case (line, rowId) =>
          val lineData = line.split(",")
          val (features, label) = (lineData.dropRight(1), lineData.last)
          Example[String, Seq[String]](label = label, features = features, id = rowId.toString)
        }
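
    // E.g. the line "verb=join,noun=board,prep=as,prep_obj=director,V" becomes an
    // Example with label "V" and features
    // Seq("verb=join", "noun=board", "prep=as", "prep_obj=director").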
  }
}