org.apache.lucene.analysis.fr.FrenchStemmer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-analyzers Show documentation
Additional Analyzers
There is a newer version: 3.6.2
package org.apache.lucene.analysis.fr;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * A stemmer for French words. 
 * 
 * The algorithm is based on the work of
 * Dr Martin Porter on his snowball project

 * refer to http://snowball.sourceforge.net/french/stemmer.html

 * (French stemming algorithm) for details
 * 
 */

public class FrenchStemmer {

    /**
     * Buffer for the terms while stemming them.
     */
    private StringBuilder sb = new StringBuilder();

    /**
     * A temporary buffer, used to reconstruct R2
     */
     private StringBuilder tb = new StringBuilder();

	/**
	 * Region R0 is equal to the whole buffer
	 */
	private String R0;

	/**
	 * Region RV
	 * "If the word begins with two vowels, RV is the region after the third letter,
	 * otherwise the region after the first vowel not at the beginning of the word,
	 * or the end of the word if these positions cannot be found."
	 */
    private String RV;

	/**
	 * Region R1
	 * "R1 is the region after the first non-vowel following a vowel
	 * or is the null region at the end of the word if there is no such non-vowel"
	 */
    private String R1;

	/**
	 * Region R2
	 * "R2 is the region after the first non-vowel in R1 following a vowel
	 * or is the null region at the end of the word if there is no such non-vowel"
	 */
    private String R2;


	/**
	 * Set to true if we need to perform step 2
	 */
    private boolean suite;

	/**
	 * Set to true if the buffer was modified
	 */
    private boolean modified;


    /**
     * Stems the given term to a unique discriminator.
     *
     * @param term  java.langString The term that should be stemmed
     * @return java.lang.String  Discriminator for term
     */
    protected String stem( String term ) {
		if ( !isStemmable( term ) ) {
			return term;
		}

		// Use lowercase for medium stemming.
		term = term.toLowerCase();

		// Reset the StringBuilder.
		sb.delete( 0, sb.length() );
		sb.insert( 0, term );

		// reset the booleans
		modified = false;
		suite = false;

		sb = treatVowels( sb );

		setStrings();

		step1();

		if (!modified || suite)
		{
			if (RV != null)
			{
				suite = step2a();
				if (!suite)
					step2b();
			}
		}

		if (modified || suite)
			step3();
		else
			step4();

		step5();

		step6();

		return sb.toString();
    }

	/**
	 * Sets the search region Strings

	 * it needs to be done each time the buffer was modified
	 */
	private void setStrings() {
		// set the strings
		R0 = sb.toString();
		RV = retrieveRV( sb );
		R1 = retrieveR( sb );
		if ( R1 != null )
		{
			tb.delete( 0, tb.length() );
			tb.insert( 0, R1 );
			R2 = retrieveR( tb );
		}
		else
			R2 = null;
	}

	/**
	 * First step of the Porter Algorithm

	 * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
	 */
	private void step1( ) {
		String[] suffix = { "ances", "iqUes", "ismes", "ables", "istes", "ance", "iqUe", "isme", "able", "iste" };
		deleteFrom( R2, suffix );

		replaceFrom( R2, new String[] { "logies", "logie" }, "log" );
		replaceFrom( R2, new String[] { "usions", "utions", "usion", "ution" }, "u" );
		replaceFrom( R2, new String[] { "ences", "ence" }, "ent" );

		String[] search = { "atrices", "ateurs", "ations", "atrice", "ateur", "ation"};
		deleteButSuffixFromElseReplace( R2, search, "ic",  true, R0, "iqU" );

		deleteButSuffixFromElseReplace( R2, new String[] { "ements", "ement" }, "eus", false, R0, "eux" );
		deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "ativ", false );
		deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "iv", false );
		deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "abl", false );
		deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "iqU", false );

		deleteFromIfTestVowelBeforeIn( R1, new String[] { "issements", "issement" }, false, R0 );
		deleteFrom( RV, new String[] { "ements", "ement" } );

		deleteButSuffixFromElseReplace( R2, new String[] { "ités", "ité" }, "abil", false, R0, "abl" );
		deleteButSuffixFromElseReplace( R2, new String[] { "ités", "ité" }, "ic", false, R0, "iqU" );
		deleteButSuffixFrom( R2, new String[] { "ités", "ité" }, "iv", true );

		String[] autre = { "ifs", "ives", "if", "ive" };
		deleteButSuffixFromElseReplace( R2, autre, "icat", false, R0, "iqU" );
		deleteButSuffixFromElseReplace( R2, autre, "at", true, R2, "iqU" );

		replaceFrom( R0, new String[] { "eaux" }, "eau" );

		replaceFrom( R1, new String[] { "aux" }, "al" );

		deleteButSuffixFromElseReplace( R2, new String[] { "euses", "euse" }, "", true, R1, "eux" );

		deleteFrom( R2, new String[] { "eux" } );

		// if one of the next steps is performed, we will need to perform step2a
		boolean temp = false;
		temp = replaceFrom( RV, new String[] { "amment" }, "ant" );
		if (temp == true)
			suite = true;
		temp = replaceFrom( RV, new String[] { "emment" }, "ent" );
		if (temp == true)
			suite = true;
		temp = deleteFromIfTestVowelBeforeIn( RV, new String[] { "ments", "ment" }, true, RV );
		if (temp == true)
			suite = true;

	}

	/**
	 * Second step (A) of the Porter Algorithm

	 * Will be performed if nothing changed from the first step
	 * or changed were done in the amment, emment, ments or ment suffixes

	 * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
	 *
	 * @return boolean - true if something changed in the StringBuilder
	 */
	private boolean step2a() {
		String[] search = { "îmes", "îtes", "iraIent", "irait", "irais", "irai", "iras", "ira",
							"irent", "iriez", "irez", "irions", "irons", "iront",
							"issaIent", "issais", "issantes", "issante", "issants", "issant",
							"issait", "issais", "issions", "issons", "issiez", "issez", "issent",
							"isses", "isse", "ir", "is", "ît", "it", "ies", "ie", "i" };
		return deleteFromIfTestVowelBeforeIn( RV, search, false, RV );
	}

	/**
	 * Second step (B) of the Porter Algorithm

	 * Will be performed if step 2 A was performed unsuccessfully

	 * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
	 */
	private void step2b() {
		String[] suffix = { "eraIent", "erais", "erait", "erai", "eras", "erions", "eriez",
							"erons", "eront","erez", "èrent", "era", "ées", "iez",
							"ée", "és", "er", "ez", "é" };
		deleteFrom( RV, suffix );

		String[] search = { "assions", "assiez", "assent", "asses", "asse", "aIent",
							"antes", "aIent", "Aient", "ante", "âmes", "âtes", "ants", "ant",
							"ait", "aît", "ais", "Ait", "Aît", "Ais", "ât", "as", "ai", "Ai", "a" };
		deleteButSuffixFrom( RV, search, "e", true );

		deleteFrom( R2, new String[] { "ions" } );
	}

	/**
	 * Third step of the Porter Algorithm

	 * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
	 */
	private void step3() {
		if (sb.length()>0)
		{
			char ch = sb.charAt( sb.length()-1 );
			if (ch == 'Y')
			{
				sb.setCharAt( sb.length()-1, 'i' );
				setStrings();
			}
			else if (ch == 'ç')
			{
				sb.setCharAt( sb.length()-1, 'c' );
				setStrings();
			}
		}
	}

	/**
	 * Fourth step of the Porter Algorithm

	 * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
	 */
	private void step4() {
		if (sb.length() > 1)
		{
			char ch = sb.charAt( sb.length()-1 );
			if (ch == 's')
			{
				char b = sb.charAt( sb.length()-2 );
				if (b != 'a' && b != 'i' && b != 'o' && b != 'u' && b != 'è' && b != 's')
				{
					sb.delete( sb.length() - 1, sb.length());
					setStrings();
				}
			}
		}
		boolean found = deleteFromIfPrecededIn( R2, new String[] { "ion" }, RV, "s" );
		if (!found)
		found = deleteFromIfPrecededIn( R2, new String[] { "ion" }, RV, "t" );

		replaceFrom( RV, new String[] { "Ière", "ière", "Ier", "ier" }, "i" );
		deleteFrom( RV, new String[] { "e" } );
		deleteFromIfPrecededIn( RV, new String[] { "ë" }, R0, "gu" );
	}

	/**
	 * Fifth step of the Porter Algorithm

	 * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
	 */
	private void step5() {
		if (R0 != null)
		{
			if (R0.endsWith("enn") || R0.endsWith("onn") || R0.endsWith("ett") || R0.endsWith("ell") || R0.endsWith("eill"))
			{
				sb.delete( sb.length() - 1, sb.length() );
				setStrings();
			}
		}
	}

	/**
	 * Sixth (and last!) step of the Porter Algorithm

	 * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
	 */
	private void step6() {
		if (R0!=null && R0.length()>0)
		{
			boolean seenVowel = false;
			boolean seenConson = false;
			int pos = -1;
			for (int i = R0.length()-1; i > -1; i--)
			{
				char ch = R0.charAt(i);
				if (isVowel(ch))
				{
					if (!seenVowel)
					{
						if (ch == 'é' || ch == 'è')
						{
							pos = i;
							break;
						}
					}
					seenVowel = true;
				}
				else
				{
					if (seenVowel)
						break;
					else
						seenConson = true;
				}
			}
			if (pos > -1 && seenConson && !seenVowel)
				sb.setCharAt(pos, 'e');
		}
	}

	/**
	 * Delete a suffix searched in zone "source" if zone "from" contains prefix + search string
	 *
	 * @param source java.lang.String - the primary source zone for search
	 * @param search java.lang.String[] - the strings to search for suppression
	 * @param from java.lang.String - the secondary source zone for search
	 * @param prefix java.lang.String - the prefix to add to the search string to test
	 * @return boolean - true if modified
	 */
	private boolean deleteFromIfPrecededIn( String source, String[] search, String from, String prefix ) {
		boolean found = false;
		if (source!=null )
		{
			for (int i = 0; i < search.length; i++) {
				if ( source.endsWith( search[i] ))
				{
					if (from!=null && from.endsWith( prefix + search[i] ))
					{
						sb.delete( sb.length() - search[i].length(), sb.length());
						found = true;
						setStrings();
						break;
					}
				}
			}
		}
		return found;
	}

	/**
	 * Delete a suffix searched in zone "source" if the preceding letter is (or isn't) a vowel
	 *
	 * @param source java.lang.String - the primary source zone for search
	 * @param search java.lang.String[] - the strings to search for suppression
	 * @param vowel boolean - true if we need a vowel before the search string
	 * @param from java.lang.String - the secondary source zone for search (where vowel could be)
	 * @return boolean - true if modified
	 */
	private boolean deleteFromIfTestVowelBeforeIn( String source, String[] search, boolean vowel, String from ) {
		boolean found = false;
		if (source!=null && from!=null)
		{
			for (int i = 0; i < search.length; i++) {
				if ( source.endsWith( search[i] ))
				{
					if ((search[i].length() + 1) <= from.length())
					{
						boolean test = isVowel(sb.charAt(sb.length()-(search[i].length()+1)));
						if (test == vowel)
						{
							sb.delete( sb.length() - search[i].length(), sb.length());
							modified = true;
							found = true;
							setStrings();
							break;
						}
					}
				}
			}
		}
		return found;
	}

	/**
	 * Delete a suffix searched in zone "source" if preceded by the prefix
	 *
	 * @param source java.lang.String - the primary source zone for search
	 * @param search java.lang.String[] - the strings to search for suppression
	 * @param prefix java.lang.String - the prefix to add to the search string to test
	 * @param without boolean - true if it will be deleted even without prefix found
	 */
	private void deleteButSuffixFrom( String source, String[] search, String prefix, boolean without ) {
		if (source!=null)
		{
			for (int i = 0; i < search.length; i++) {
				if ( source.endsWith( prefix + search[i] ))
				{
					sb.delete( sb.length() - (prefix.length() + search[i].length()), sb.length() );
					modified = true;
					setStrings();
					break;
				}
				else if ( without && source.endsWith( search[i] ))
				{
					sb.delete( sb.length() - search[i].length(), sb.length() );
					modified = true;
					setStrings();
					break;
				}
			}
		}
	}

	/**
	 * Delete a suffix searched in zone "source" if preceded by prefix

	 * or replace it with the replace string if preceded by the prefix in the zone "from"

	 * or delete the suffix if specified
	 *
	 * @param source java.lang.String - the primary source zone for search
	 * @param search java.lang.String[] - the strings to search for suppression
	 * @param prefix java.lang.String - the prefix to add to the search string to test
	 * @param without boolean - true if it will be deleted even without prefix found
	 */
	private void deleteButSuffixFromElseReplace( String source, String[] search, String prefix, boolean without, String from, String replace ) {
		if (source!=null)
		{
			for (int i = 0; i < search.length; i++) {
				if ( source.endsWith( prefix + search[i] ))
				{
					sb.delete( sb.length() - (prefix.length() + search[i].length()), sb.length() );
					modified = true;
					setStrings();
					break;
				}
				else if ( from!=null && from.endsWith( prefix + search[i] ))
				{
					sb.replace( sb.length() - (prefix.length() + search[i].length()), sb.length(), replace );
					modified = true;
					setStrings();
					break;
				}
				else if ( without && source.endsWith( search[i] ))
				{
					sb.delete( sb.length() - search[i].length(), sb.length() );
					modified = true;
					setStrings();
					break;
				}
			}
		}
	}

	/**
	 * Replace a search string with another within the source zone
	 *
	 * @param source java.lang.String - the source zone for search
	 * @param search java.lang.String[] - the strings to search for replacement
	 * @param replace java.lang.String - the replacement string
	 */
	private boolean replaceFrom( String source, String[] search, String replace ) {
		boolean found = false;
		if (source!=null)
		{
			for (int i = 0; i < search.length; i++) {
				if ( source.endsWith( search[i] ))
				{
					sb.replace( sb.length() - search[i].length(), sb.length(), replace );
					modified = true;
					found = true;
					setStrings();
					break;
				}
			}
		}
		return found;
	}

	/**
	 * Delete a search string within the source zone
	 *
	 * @param source the source zone for search
	 * @param suffix the strings to search for suppression
	 */
	private void deleteFrom(String source, String[] suffix ) {
		if (source!=null)
		{
			for (int i = 0; i < suffix.length; i++) {
				if (source.endsWith( suffix[i] ))
				{
					sb.delete( sb.length() - suffix[i].length(), sb.length());
					modified = true;
					setStrings();
					break;
				}
			}
		}
	}

	/**
	 * Test if a char is a french vowel, including accentuated ones
	 *
	 * @param ch the char to test
	 * @return boolean - true if the char is a vowel
	 */
	private boolean isVowel(char ch) {
		switch (ch)
		{
			case 'a':
			case 'e':
			case 'i':
			case 'o':
			case 'u':
			case 'y':
			case 'â':
			case 'à':
			case 'ë':
			case 'é':
			case 'ê':
			case 'è':
			case 'ï':
			case 'î':
			case 'ô':
			case 'ü':
			case 'ù':
			case 'û':
				return true;
			default:
				return false;
		}
	}

	/**
	 * Retrieve the "R zone" (1 or 2 depending on the buffer) and return the corresponding string

	 * "R is the region after the first non-vowel following a vowel
	 * or is the null region at the end of the word if there is no such non-vowel"

	 * @param buffer java.lang.StringBuilder - the in buffer
	 * @return java.lang.String - the resulting string
	 */
	private String retrieveR( StringBuilder buffer ) {
		int len = buffer.length();
		int pos = -1;
		for (int c = 0; c < len; c++) {
			if (isVowel( buffer.charAt( c )))
			{
				pos = c;
				break;
			}
		}
		if (pos > -1)
		{
			int consonne = -1;
			for (int c = pos; c < len; c++) {
				if (!isVowel(buffer.charAt( c )))
				{
					consonne = c;
					break;
				}
			}
			if (consonne > -1 && (consonne+1) < len)
				return buffer.substring( consonne+1, len );
			else
				return null;
		}
		else
			return null;
	}

	/**
	 * Retrieve the "RV zone" from a buffer an return the corresponding string

	 * "If the word begins with two vowels, RV is the region after the third letter,
	 * otherwise the region after the first vowel not at the beginning of the word,
	 * or the end of the word if these positions cannot be found."

	 * @param buffer java.lang.StringBuilder - the in buffer
	 * @return java.lang.String - the resulting string
	 */
	private String retrieveRV( StringBuilder buffer ) {
		int len = buffer.length();
		if ( buffer.length() > 3)
		{
			if ( isVowel(buffer.charAt( 0 )) && isVowel(buffer.charAt( 1 ))) {
				return buffer.substring(3,len);
			}
			else
			{
				int pos = 0;
				for (int c = 1; c < len; c++) {
					if (isVowel( buffer.charAt( c )))
					{
						pos = c;
						break;
					}
				}
				if ( pos+1 < len )
					return buffer.substring( pos+1, len );
				else
					return null;
			}
		}
		else
			return null;
	}



    /**
	 * Turns u and i preceded AND followed by a vowel to UpperCase

	 * Turns y preceded OR followed by a vowel to UpperCase

	 * Turns u preceded by q to UpperCase

     *
     * @param buffer java.util.StringBuilder - the buffer to treat
     * @return java.util.StringBuilder - the treated buffer
     */
    private StringBuilder treatVowels( StringBuilder buffer ) {
		for ( int c = 0; c < buffer.length(); c++ ) {
			char ch = buffer.charAt( c );

			if (c == 0) // first char
			{
				if (buffer.length()>1)
				{
					if (ch == 'y' && isVowel(buffer.charAt( c + 1 )))
						buffer.setCharAt( c, 'Y' );
				}
			}
			else if (c == buffer.length()-1) // last char
			{
				if (ch == 'u' && buffer.charAt( c - 1 ) == 'q')
					buffer.setCharAt( c, 'U' );
				if (ch == 'y' && isVowel(buffer.charAt( c - 1 )))
					buffer.setCharAt( c, 'Y' );
			}
			else // other cases
			{
				if (ch == 'u')
				{
					if (buffer.charAt( c - 1) == 'q')
						buffer.setCharAt( c, 'U' );
					else if (isVowel(buffer.charAt( c - 1 )) && isVowel(buffer.charAt( c + 1 )))
						buffer.setCharAt( c, 'U' );
				}
				if (ch == 'i')
				{
					if (isVowel(buffer.charAt( c - 1 )) && isVowel(buffer.charAt( c + 1 )))
						buffer.setCharAt( c, 'I' );
				}
				if (ch == 'y')
				{
					if (isVowel(buffer.charAt( c - 1 )) || isVowel(buffer.charAt( c + 1 )))
						buffer.setCharAt( c, 'Y' );
				}
			}
		}

		return buffer;
    }

    /**
     * Checks a term if it can be processed correctly.
     *
     * @return boolean - true if, and only if, the given term consists in letters.
     */
    private boolean isStemmable( String term ) {
		boolean upper = false;
		int first = -1;
		for ( int c = 0; c < term.length(); c++ ) {
			// Discard terms that contain non-letter characters.
			if ( !Character.isLetter( term.charAt( c ) ) ) {
				return false;
			}
			// Discard terms that contain multiple uppercase letters.
			if ( Character.isUpperCase( term.charAt( c ) ) ) {
				if ( upper ) {
					return false;
				}
			// First encountered uppercase letter, set flag and save
			// position.
				else {
					first = c;
					upper = true;
				}
			}
		}
		// Discard the term if it contains a single uppercase letter that
		// is not starting the term.
		if ( first > 0 ) {
			return false;
		}
		return true;
    }
}