/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.util.plugin.ResourceLoaderAware;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
* @version $Id: SynonymFilterFactory.java 712457 2008-11-09 01:24:11Z koji $
*/
public class SynonymFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
public void inform(ResourceLoader loader) {
String synonyms = args.get("synonyms");
boolean ignoreCase = getBoolean("ignoreCase", false);
boolean expand = getBoolean("expand", true);
String tf = args.get("tokenizerFactory");
TokenizerFactory tokFactory = null;
if( tf != null ){
tokFactory = loadTokenizerFactory( loader, tf, args );
}
if (synonyms != null) {
List wlist=null;
try {
File synonymFile = new File(synonyms);
if (synonymFile.exists()) {
wlist = loader.getLines(synonyms);
} else {
List files = StrUtils.splitFileNames(synonyms);
wlist = new ArrayList();
for (String file : files) {
List lines = loader.getLines(file.trim());
wlist.addAll(lines);
}
}
} catch (IOException e) {
throw new RuntimeException(e);
}
synMap = new SynonymMap(ignoreCase);
parseRules(wlist, synMap, "=>", ",", expand,tokFactory);
}
}
private SynonymMap synMap;
static void parseRules(List rules, SynonymMap map, String mappingSep,
String synSep, boolean expansion, TokenizerFactory tokFactory) {
int count=0;
for (String rule : rules) {
// To use regexes, we need an expression that specifies an odd number of chars.
// This can't really be done with string.split(), and since we need to
// do unescaping at some point anyway, we wouldn't be saving any effort
// by using regexes.
List mapping = StrUtils.splitSmart(rule, mappingSep, false);
List> source;
List> target;
if (mapping.size() > 2) {
throw new RuntimeException("Invalid Synonym Rule:" + rule);
} else if (mapping.size()==2) {
source = getSynList(mapping.get(0), synSep, tokFactory);
target = getSynList(mapping.get(1), synSep, tokFactory);
} else {
source = getSynList(mapping.get(0), synSep, tokFactory);
if (expansion) {
// expand to all arguments
target = source;
} else {
// reduce to first argument
target = new ArrayList>(1);
target.add(source.get(0));
}
}
boolean includeOrig=false;
for (List fromToks : source) {
count++;
for (List toToks : target) {
map.add(fromToks,
SynonymMap.makeTokens(toToks),
includeOrig,
true
);
}
}
}
}
// a , b c , d e f => [[a],[b,c],[d,e,f]]
private static List> getSynList(String str, String separator, TokenizerFactory tokFactory) {
List strList = StrUtils.splitSmart(str, separator, false);
// now split on whitespace to get a list of token strings
List> synList = new ArrayList>();
for (String toks : strList) {
List tokList = tokFactory == null ?
StrUtils.splitWS(toks, true) : splitByTokenizer(toks, tokFactory);
synList.add(tokList);
}
return synList;
}
private static List splitByTokenizer(String source, TokenizerFactory tokFactory){
StringReader reader = new StringReader( source );
TokenStream ts = loadTokenizer(tokFactory, reader);
List tokList = new ArrayList();
try {
for( Token token = ts.next(); token != null; token = ts.next() ){
String text = new String(token.termBuffer(), 0, token.termLength());
if( text.length() > 0 )
tokList.add( text );
}
} catch (IOException e) {
throw new RuntimeException(e);
}
finally{
reader.close();
}
return tokList;
}
private static TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname, Map args){
TokenizerFactory tokFactory = (TokenizerFactory)loader.newInstance( cname );
tokFactory.init( args );
return tokFactory;
}
private static TokenStream loadTokenizer(TokenizerFactory tokFactory, Reader reader){
return tokFactory.create( reader );
}
public SynonymMap getSynonymMap() {
return synMap;
}
public SynonymFilter create(TokenStream input) {
return new SynonymFilter(input,synMap);
}
}