/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.lucidworks.spark.analysis

import java.io.{PrintWriter, Reader, StringWriter}
import java.util.regex.Pattern
import com.lucidworks.spark.util.Utils
import org.apache.commons.io.IOUtils
import org.apache.lucene.analysis.custom.CustomAnalyzer
import org.apache.lucene.analysis.tokenattributes.{CharTermAttribute, OffsetAttribute, PositionIncrementAttribute}
import org.apache.lucene.analysis.{Analyzer, DelegatingAnalyzerWrapper, TokenStream}
import org.apache.lucene.util.{Version => LuceneVersion}
import org.apache.solr.schema.JsonPreAnalyzedParser
import org.json4s.jackson.JsonMethods._
import org.json4s.jackson.Serialization

import scala.collection.immutable
import scala.collection.mutable
import scala.collection.JavaConverters._
import scala.collection.convert.ImplicitConversions._
import scala.util.control.Breaks._
import scala.util.control.NonFatal

/**
  * This class allows simple access to custom Lucene text processing pipelines, a.k.a. text analyzers,
  * which are specified via a JSON schema that hosts named analyzer specifications and mappings from
  * field name(s) to analyzer(s).
  *
  * Here's an example schema with descriptions inline as comments:
  * {{{
  * {
  *   "defaultLuceneMatchVersion": "7.0.0" // Optional.  Supplied to analysis components
  *                                         //     that don't explicitly specify "luceneMatchVersion".
  *   "analyzers": [              // Optional.  If not included, all field mappings must be
  *     {                         //     to fully qualified class names of Lucene Analyzer subclasses.
  *       "name": "html",         // Required.  Mappings in the "fields" array below refer to this name.
  *       "charFilters":[{        // Optional.
  *         "type": "htmlstrip"   // Required. "htmlstrip" is the SPI name for HTMLStripCharFilter
  *       }],
  *       "tokenizer": {          // Required.  Only one allowed.
  *         "type": "standard"    // Required. "standard" is the SPI name for StandardTokenizer
  *       },
  *       "filters": [{           // Optional.
  *           "type": "stop",     // Required.  "stop" is the SPI name for StopFilter
  *           "ignoreCase": "true",  // Component-specific params
  *           "format": "snowball",
  *           "words": "org/apache/lucene/analysis/snowball/english_stop.txt"
  *         }, {
  *           "type": "lowercase" // Required. "lowercase" is the SPI name for LowerCaseFilter
  *       }]
  *     },
  *     { "name": "stdtok", "tokenizer": { "type": "standard" } }
  *   ],
  *   "fields": [{                // Required.  To lookup an analyzer for a field, first the "name"
  *                               //     mappings are consulted, and then the "regex" mappings are
  *                               //     tested, in the order specified.
  *       "name": "keywords",     // Either "name" or "regex" is required.  "name" matches the field name exactly.
  *       "analyzer": "org.apache.lucene.analysis.core.KeywordAnalyzer" // FQCN of an Analyzer subclass
  *     }, {
  *       "regex": ".*html.*"     // Either "name" or "regex" is required.  "regex" must match the whole field name.
  *       "analyzer": "html"      // Reference to the named analyzer specified in the "analyzers" section.
  *     }, {
  *       "regex": ".+",          // Either "name" or "regex" is required.  "regex" must match the whole field name.
  *       "analyzer": "stdtok"    // Reference to the named analyzer specified in the "analyzers" section.
  *   }]
  * }
  * }}}
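  *
  * Given a schema like the one above, usage looks roughly like this (where `schemaJson` holds
  * the JSON above; the field name and input are illustrative):
  * {{{
  * val textAnalyzer = new LuceneTextAnalyzer(schemaJson)
  * if (textAnalyzer.isValid) {
  *   textAnalyzer.analyze("plain_text", "Hello, World!") // matched by the ".+" regex -> "stdtok"
  *   // => Seq("Hello", "World")
  * } else {
  *   println(textAnalyzer.invalidMessages)
  * }
  * }}}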
  */
class LuceneTextAnalyzer(analysisSchema: String) extends Serializable {
  @transient private lazy val analyzerSchema = new AnalyzerSchema(analysisSchema)
  @transient private lazy val analyzerCache = mutable.Map.empty[String, Analyzer]
  def isValid: Boolean = analyzerSchema.isValid
  def invalidMessages: String = analyzerSchema.invalidMessages.result()
  /** Returns the analyzer mapped to the given field in the configured analysis schema, if any. */
  def getFieldAnalyzer(field: String): Option[Analyzer] = analyzerSchema.getAnalyzer(field)

  /** Analyzes the given value using the analyzer mapped to the given field in the configured
    * analysis schema.  Strings are analyzed directly, string arrays are analyzed as multi-valued
    * fields, other values are converted via toString, and null yields an empty sequence.
    */
  def analyze(field: String, o: Any): Seq[String] = {
    o match {
      case s: String => analyze(field, s)
      case as: mutable.WrappedArray[String] @unchecked => analyzeMV(field, as)
      case a: Any => analyze(field, a.toString)
      case _ => Seq.empty[String] // null
    }
  }

  /** Java-friendly version of the Any-accepting analyze() overload above. */
  def analyzeJava(field: String, o: Any): java.util.List[String] = {
    analyze(field, o).asJava
  }

  /** Looks up the analyzer mapped to the given field from the configured analysis schema,
    * uses it to perform analysis on the given string, returning the produced token sequence.
    */
  def analyze(field: String, str: String): Seq[String] = {
    if ( ! isValid) throw new IllegalArgumentException(invalidMessages)
    if (str == null) return Seq.empty[String]
    analyze(tokenStream(field, str))
  }
  /** Looks up the analyzer mapped to the given field from the configured analysis schema,
    * uses it to perform analysis on the given reader, returning the produced token sequence.
    */
  def analyze(field: String, reader: Reader): Seq[String] = {
    if ( ! isValid) throw new IllegalArgumentException(invalidMessages)
    analyze(tokenStream(field, reader))
  }
  /** For each of the field->value pairs in fieldValues, looks up the analyzer mapped
    * to the field from the configured analysis schema, and uses it to perform analysis on the
    * value.  Returns a map from the fields to the produced token sequences.
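    * For example, `analyze(Map("plain_text" -> "Hello World"))` would return
    * `Map("plain_text" -> Seq("Hello", "World"))` under the standard-tokenizer mapping from the
    * class-level example.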
    */
  def analyze(fieldValues: immutable.Map[String,String]): immutable.Map[String,Seq[String]] = {
    val builder = immutable.Map.newBuilder[String,Seq[String]]
    for ((field, value) <- fieldValues) builder += field -> analyze(field, value)
    builder.result()
  }
  /** Looks up the analyzer mapped to the given field from the configured analysis schema,
    * uses it to perform analysis on each of the given values, and returns the flattened
    * concatenation of the produced token sequences.
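    * For example, `analyzeMV("plain_text", Seq("one two", "three four"))` would yield
    * `Seq("one", "two", "three", "four")` under the standard-tokenizer mapping from the
    * class-level example.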
    */
  def analyzeMV(field: String, values: Seq[String]): Seq[String] = {
    if (values == null) return Seq.empty[String]
    val seqBuilder = Seq.newBuilder[String]
    values foreach { value => seqBuilder ++= analyze(field, value) }
    seqBuilder.result()
  }
  /** For each of the field->multi-value pairs in fieldValues, looks up the analyzer mapped
    * to the field from the configured analysis schema, and uses it to perform analysis on
    * each of the values.  Returns a map from the fields to the flattened concatenation of the
    * produced token sequences.
    */
  def analyzeMV(fieldValues: immutable.Map[String,Seq[String]]): immutable.Map[String,Seq[String]] = {
    val builder = immutable.Map.newBuilder[String,Seq[String]]
    for ((field, values) <- fieldValues) { builder += field -> analyzeMV(field, values) }
    builder.result()
  }
  /** Java-friendly version: looks up the analyzer mapped to the given field from the configured
    * analysis schema, uses it to perform analysis on the given string, returning the produced
    * token sequence. */
  def analyzeJava(field: String, str: String): java.util.List[String] = {
    analyze(field, str).asJava
  }
  /** Java-friendly version: looks up the analyzer mapped to the given field from the configured
    * analysis schema, uses it to perform analysis on the given reader, returning the produced
    * token sequence. */
  def analyzeJava(field: String, reader: Reader): java.util.List[String] = {
    analyze(field, reader).asJava
  }
  /** Java-friendly version: for each of the field->value pairs in fieldValues, looks up the
    * analyzer mapped to the field from the configured analysis schema, and uses it to perform
    * analysis on the value.  Returns a map from the fields to the produced token sequences.
    */
  def analyzeJava(fieldValues: java.util.Map[String,String]): java.util.Map[String,java.util.List[String]] = {
    val output = new java.util.HashMap[String,java.util.List[String]]()
    for ((field, value) <- fieldValues) output.put(field, analyzeJava(field, value))
    java.util.Collections.unmodifiableMap(output)
  }
  /** Java-friendly version: looks up the analyzer mapped to the given field from the configured
    * analysis schema, uses it to perform analysis on each of the given values, and returns the
    * flattened concatenation of the produced token sequences.
    */
  def analyzeMVJava(field: String, values: java.util.List[String]): java.util.List[String] = {
    if (values == null) return java.util.Collections.emptyList[String]()
    val output = new java.util.ArrayList[String]()
    values foreach { value => output.addAll(analyzeJava(field, value)) }
    output
  }
  /** Java-friendly version: for each of the field->multi-value pairs in fieldValues, looks up the
    * analyzer mapped to the field from the configured analysis schema, and uses it to perform
    * analysis on each of the values.  Returns a map from the fields to the flattened concatenation
    * of the produced token sequences.
    */
  def analyzeMVJava(fieldValues: java.util.Map[String,java.util.List[String]])
  : java.util.Map[String,java.util.List[String]] = {
    val output = new java.util.HashMap[String,java.util.List[String]]()
    for ((field, values) <- fieldValues) output.put(field, analyzeMVJava(field, values))
    java.util.Collections.unmodifiableMap(output)
  }
  /** Looks up the analyzer mapped to `fieldName` and returns a [[org.apache.lucene.analysis.TokenStream]]
    * for the analyzer to tokenize the contents of `text`. */
  def tokenStream(fieldName: String, text: String): TokenStream = analyzerWrapper.tokenStream(fieldName, text)
  /** Looks up the analyzer mapped to `fieldName` and returns a [[org.apache.lucene.analysis.TokenStream]]
    * for the analyzer to tokenize the contents of `reader`. */
  def tokenStream(fieldName: String, reader: Reader): TokenStream = analyzerWrapper.tokenStream(fieldName, reader)
  /** Looks up the analyzer mapped to the given field from the configured analysis schema,
    * uses it to perform analysis on the given string, and returns a PreAnalyzedField-compatible
    * JSON string with the following serialized attributes:
    *
    *   - CharTermAttribute (token text)
    *   - OffsetAttribute (start and end character offsets)
    *   - PositionIncrementAttribute (token position relative to the previous token)
    *
    * If stored = true, the original string input value will be included as a value to be stored.
    * (Note that the Solr schema for the destination Solr field must be configured to store the
    * value; if it is not, then the stored value included in the JSON will be ignored by Solr.)
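    *
    * For illustration, analyzing "Hello, World!" for a field mapped to the "stdtok" analyzer
    * from the class-level example, with stored = true, produces JSON along these lines (the key
    * names are those defined by Solr's JsonPreAnalyzedParser):
    * {{{
    * {"v":"1","str":"Hello, World!","tokens":[
    *   {"t":"Hello","s":0,"e":5,"i":1},
    *   {"t":"World","s":7,"e":12,"i":1}]}
    * }}}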
    */
  def toPreAnalyzedJson(field: String, str: String, stored: Boolean): String = {
    toPreAnalyzedJson(tokenStream(field, str), if (stored) Some(str) else None)
  }
  /** Looks up the analyzer mapped to the given field from the configured analysis schema,
    * uses it to perform analysis on the given reader, and returns a PreAnalyzedField-compatible
    * JSON string with the following serialized attributes:
    *
    *   - CharTermAttribute (token text)
    *   - OffsetAttribute (start and end character offsets)
    *   - PositionIncrementAttribute (token position relative to the previous token)
    *
    * If stored = true, the original reader input value, read into a string, will be included as
    * a value to be stored. (Note that the Solr schema for the destination Solr field must be
    * configured to store the value; if it is not, then the stored value included in the JSON
    * will be ignored by Solr.)
    */
  def toPreAnalyzedJson(field: String, reader: Reader, stored: Boolean): String = {
    if (stored)
      toPreAnalyzedJson(field, IOUtils.toString(reader), stored = true)
    else
      toPreAnalyzedJson(tokenStream(field, reader), None)
  }
  private def toPreAnalyzedJson(stream: TokenStream, str: Option[String]): String = {
    // Implementation note: Solr's JsonPreAnalyzedParser.toFormattedString() produces JSON
    // suitable for use with PreAnalyzedField, but there are problems with using it with
    // CustomAnalyzer:
    //
    //   - toFormattedString() will serialize all attributes present on the passed-in token
    //     stream's AttributeSource.  CustomAnalyzer fixes that set to the attributes of
    //     PackedTokenAttributeImpl, which includes PositionLengthAttribute and TypeAttribute;
    //     neither of those is indexed, so neither should be output (by default, anyway)
    //     from this method.
    //   - To modify the set of attributes that CustomAnalyzer has in its AttributeSource,
    //     CustomAnalyzer can't be extended because it's final, so CustomAnalyzer's
    //     createComponents() method can't be overridden to pass in an alternate AttributeFactory
    //     to TokenizerFactory.create().  However, a wrapper can be constructed that forwards all
    //     methods except createComponents(), and then have createComponents() do the right thing.
    //   - Once an alternate AttributeFactory is used in an effectively overridden
    //     CustomAnalyzer.createComponents(), this form will be cached for future uses, but we
    //     don't want that, since it might conflict with the analyze*() methods' requirements,
    //     and future versions of toPreAnalyzedJson might allow for customization of attributes
    //     to output (including e.g. PayloadAttribute).  So we would have to either use an
    //     alternate cache, or not cache analyzers used by toPreAnalyzedJson(), both of which
    //     seem overcomplicated.
    //
    // The code below constructs JSON with a fixed set of serialized attributes.

    val termAtt = stream.addAttribute(classOf[CharTermAttribute])
    val offsetAtt = stream.addAttribute(classOf[OffsetAttribute])
    val posIncAtt = stream.addAttribute(classOf[PositionIncrementAttribute])
    val tokens = List.newBuilder[immutable.ListMap[String, Any]]
    val token = immutable.ListMap.newBuilder[String, Any]
    try {
      stream.reset()
      while (stream.incrementToken) {
        token.clear()
        token += (JsonPreAnalyzedParser.TOKEN_KEY -> new String(termAtt.buffer, 0, termAtt.length))
        token += (JsonPreAnalyzedParser.OFFSET_START_KEY -> offsetAtt.startOffset)
        token += (JsonPreAnalyzedParser.OFFSET_END_KEY -> offsetAtt.endOffset)
        token += (JsonPreAnalyzedParser.POSINCR_KEY -> posIncAtt.getPositionIncrement)
        tokens += token.result
      }
      stream.end()
    } finally {
      stream.close()
    }
    val topLevel = immutable.ListMap.newBuilder[String, Any]
    topLevel += (JsonPreAnalyzedParser.VERSION_KEY -> JsonPreAnalyzedParser.VERSION)
    if (str.isDefined) topLevel += (JsonPreAnalyzedParser.STRING_KEY -> str.get)
    topLevel += (JsonPreAnalyzedParser.TOKENS_KEY -> tokens.result)
    implicit val formats = org.json4s.DefaultFormats // required by Serialization.write()
    Serialization.write(topLevel.result)
  }
  @transient private lazy val analyzerWrapper = new AnalyzerWrapper
  private class AnalyzerWrapper extends DelegatingAnalyzerWrapper(Analyzer.PER_FIELD_REUSE_STRATEGY) {
    override protected def getWrappedAnalyzer(field: String): Analyzer = {
      analyzerCache.synchronized {
        var analyzer = analyzerCache.get(field)
        if (analyzer.isEmpty) {
          if (isValid) analyzer = analyzerSchema.getAnalyzer(field)
          if ( ! isValid) throw new IllegalArgumentException(invalidMessages) // getAnalyzer can make isValid false
          if (analyzer.isEmpty) throw new IllegalArgumentException(s"No analyzer defined for field '$field'")
          analyzerCache.put(field, analyzer.get)
        }
        analyzer.get
      }
    }
  }
  private def analyze(inputStream: TokenStream): Seq[String] = {
    val builder = Seq.newBuilder[String]
    val charTermAttr = inputStream.addAttribute(classOf[CharTermAttribute])
    try {
      inputStream.reset()
      while (inputStream.incrementToken) builder += charTermAttr.toString
      inputStream.end()
    } finally {
      inputStream.close()
    }
    builder.result()
  }
}

private class AnalyzerSchema(val analysisSchema: String) {
  implicit val formats = org.json4s.DefaultFormats    // enable extract
  val schemaConfig = parse(analysisSchema).extract[SchemaConfig]
  val analyzers = mutable.Map[String, Analyzer]()
  var isValid: Boolean = true
  val invalidMessages: StringBuilder = new StringBuilder()
  try {
    schemaConfig.defaultLuceneMatchVersion.foreach { version =>
      if ( ! LuceneVersion.parseLeniently(version).onOrAfter(LuceneVersion.LUCENE_7_0_0)) {
        isValid = false
        invalidMessages.append(
          s"""defaultLuceneMatchVersion "${schemaConfig.defaultLuceneMatchVersion}"""")
          .append(" is not on or after ").append(LuceneVersion.LUCENE_7_0_0).append("\n")
      }
    }
  } catch {
    case NonFatal(e) => isValid = false
      invalidMessages.append(e.getMessage).append("\n")
  }
  schemaConfig.fields.foreach { field =>
    if (field.name.isDefined) {
      if (field.regex.isDefined) {
        isValid = false
        invalidMessages.append("""Both "name" and "regex" keys are defined in a field,"""
          + " but only one may be.\n")
      }
    } else if (field.regex.isEmpty) {
      isValid = false
      invalidMessages.append("""Neither "name" nor "regex" key is defined in a field,""").
        append(" but one must be.\n")
    }
    if (schemaConfig.namedAnalyzerConfigs.get(field.analyzer).isEmpty) {
      def badAnalyzerMessage(suffix: String): Unit = {
        invalidMessages.append(s"""field "${field.fieldRef}": """)
          .append(s""" analyzer "${field.analyzer}" """).append(suffix)
      }
      try { // Attempt to interpret the analyzer as a fully qualified class name.
        // An erased asInstanceOf[Class[_ <: Analyzer]] cast can never fail, so check
        // assignability explicitly instead.
        val clazz = Utils.classForName(field.analyzer)
        if ( ! classOf[Analyzer].isAssignableFrom(clazz)) {
          isValid = false
          badAnalyzerMessage("is not a subclass of org.apache.lucene.analysis.Analyzer.\n")
        }
      } catch {
        case _: ClassNotFoundException => isValid = false
          badAnalyzerMessage("not found.\n")
      }
    }
  }
  def getAnalyzer(fieldName: String): Option[Analyzer] = {
    var analyzer: Option[Analyzer] = None
    if (isValid) {
      var fieldConfig = schemaConfig.namedFields.get(fieldName)
      if (fieldConfig.isEmpty) {
        breakable {
          schemaConfig.fields.filter(c => c.regex.isDefined).foreach { field =>
            if (field.pattern.matcher(fieldName).matches()) {
              fieldConfig = Some(field)
              break
            }
          }
        }
      }
      if (fieldConfig.isDefined) {
        val analyzerConfig = schemaConfig.namedAnalyzerConfigs.get(fieldConfig.get.analyzer)
        if (analyzerConfig.isDefined) {
          analyzer = analyzers.get(analyzerConfig.get.name)
          if (analyzer.isEmpty) try {
            analyzer = Some(buildAnalyzer(analyzerConfig.get))
            analyzers.put(analyzerConfig.get.name, analyzer.get)
          } catch {
            case NonFatal(e) => isValid = false
              val writer = new StringWriter
              writer.write(s"Exception initializing analyzer '${analyzerConfig.get.name}': ")
              e.printStackTrace(new PrintWriter(writer))
              invalidMessages.append(writer.toString).append("\n")
          }
        } else {
          try {
            val clazz = Utils.classForName(fieldConfig.get.analyzer)
            analyzer = Some(clazz.getDeclaredConstructor().newInstance().asInstanceOf[Analyzer])
            schemaConfig.defaultLuceneMatchVersion foreach { version =>
              analyzer.get.setVersion(LuceneVersion.parseLeniently(version))
            }
          } catch {
            case NonFatal(e) => isValid = false
              val writer = new StringWriter
              writer.write(s"Exception initializing analyzer '${fieldConfig.get.analyzer}': ")
              e.printStackTrace(new PrintWriter(writer))
              invalidMessages.append(writer.toString).append("\n")
          }
        }
      }
    }
    analyzer
  }
  private def buildAnalyzer(analyzerConfig: AnalyzerConfig): Analyzer = {
    var builder = CustomAnalyzer.builder()
    if (schemaConfig.defaultLuceneMatchVersion.isDefined) {
      builder = builder.withDefaultMatchVersion(
        LuceneVersion.parseLeniently(schemaConfig.defaultLuceneMatchVersion.get))
    }
    // Builder methods' param maps must be mutable to enable put("luceneMatchVersion", ...)
    if (analyzerConfig.charFilters.isDefined) {
      for (charFilter <- analyzerConfig.charFilters.get) {
        val charFilterNoType = mutable.Map[String, String]() ++ (charFilter - "type")
        builder = builder.addCharFilter(charFilter("type"), charFilterNoType)
      }
    }
    val tokenizerNoType = mutable.Map[String, String]() ++ (analyzerConfig.tokenizer - "type")
    builder = builder.withTokenizer(analyzerConfig.tokenizer("type"), tokenizerNoType)
    if (analyzerConfig.filters.isDefined) {
      for (filter <- analyzerConfig.filters.get) {
        val filterNoType = mutable.Map[String, String]() ++ (filter - "type")
        builder = builder.addTokenFilter(filter("type"), filterNoType)
      }
    }
    builder.build()
  }
}
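// The case classes below mirror the structure of the JSON analysis schema documented in the
// LuceneTextAnalyzer scaladoc; json4s extracts the parsed schema directly into them.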
private case class AnalyzerConfig(name: String,
                                  charFilters: Option[List[Map[String, String]]],
                                  tokenizer: Map[String, String],
                                  filters: Option[List[Map[String, String]]])
private case class FieldConfig(regex: Option[String], name: Option[String], analyzer: String) {
  val pattern: Pattern = regex.map(_.r.pattern).orNull
  val fieldRef: String = name.orElse(regex).getOrElse("")
}
private case class SchemaConfig(defaultLuceneMatchVersion: Option[String],
                                analyzers: List[AnalyzerConfig],
                                fields: List[FieldConfig]) {
  val namedAnalyzerConfigs: Map[String, AnalyzerConfig] = analyzers.map(a => a.name -> a).toMap
  val namedFields: Map[String, FieldConfig]
    = fields.filter(c => c.name.isDefined).map(c => c.name.get -> c).toMap
}



