All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.carrot2.text.linguistic.lucene.HindiNormalizer Maven / Gradle / Ivy


/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.text.linguistic.lucene;

/* 
 * Imported from Apache Lucene.
 * 
 * https://svn.apache.org/repos/asf/lucene/dev/trunk
 * svn rev.: 1534186
 */

import static org.carrot2.text.linguistic.lucene.StemmerUtil.*;

/**
 * Normalizer for Hindi.
 * 

* Normalizes text to remove some differences in spelling variations. *

* Implements the Hindi-language specific algorithm specified in: * Word normalization in Indian languages * Prasad Pingali and Vasudeva Varma. * http://web2py.iiit.ac.in/publications/default/download/inproceedings.pdf.3fe5b38c-02ee-41ce-9a8f-3e745670be32.pdf *

* with the following additions from Hindi CLIR in Thirty Days * Leah S. Larkey, Margaret E. Connell, and Nasreen AbdulJaleel. * http://maroo.cs.umass.edu/pub/web/getpdf.php?id=454: *

    *
  • Internal Zero-width joiner and Zero-width non-joiners are removed *
  • In addition to chandrabindu, NA+halant is normalized to anusvara *
* */ class HindiNormalizer { /** * Normalize an input buffer of Hindi text * * @param s input buffer * @param len length of input buffer * @return length of input buffer after normalization */ public int normalize(char s[], int len) { for (int i = 0; i < len; i++) { switch (s[i]) { // dead n -> bindu case '\u0928': if (i + 1 < len && s[i + 1] == '\u094D') { s[i] = '\u0902'; len = delete(s, i + 1, len); } break; // candrabindu -> bindu case '\u0901': s[i] = '\u0902'; break; // nukta deletions case '\u093C': len = delete(s, i, len); i--; break; case '\u0929': s[i] = '\u0928'; break; case '\u0931': s[i] = '\u0930'; break; case '\u0934': s[i] = '\u0933'; break; case '\u0958': s[i] = '\u0915'; break; case '\u0959': s[i] = '\u0916'; break; case '\u095A': s[i] = '\u0917'; break; case '\u095B': s[i] = '\u091C'; break; case '\u095C': s[i] = '\u0921'; break; case '\u095D': s[i] = '\u0922'; break; case '\u095E': s[i] = '\u092B'; break; case '\u095F': s[i] = '\u092F'; break; // zwj/zwnj -> delete case '\u200D': case '\u200C': len = delete(s, i, len); i--; break; // virama -> delete case '\u094D': len = delete(s, i, len); i--; break; // chandra/short -> replace case '\u0945': case '\u0946': s[i] = '\u0947'; break; case '\u0949': case '\u094A': s[i] = '\u094B'; break; case '\u090D': case '\u090E': s[i] = '\u090F'; break; case '\u0911': case '\u0912': s[i] = '\u0913'; break; case '\u0972': s[i] = '\u0905'; break; // long -> short ind. vowels case '\u0906': s[i] = '\u0905'; break; case '\u0908': s[i] = '\u0907'; break; case '\u090A': s[i] = '\u0909'; break; case '\u0960': s[i] = '\u090B'; break; case '\u0961': s[i] = '\u090C'; break; case '\u0910': s[i] = '\u090F'; break; case '\u0914': s[i] = '\u0913'; break; // long -> short dep. vowels case '\u0940': s[i] = '\u093F'; break; case '\u0942': s[i] = '\u0941'; break; case '\u0944': s[i] = '\u0943'; break; case '\u0963': s[i] = '\u0962'; break; case '\u0948': s[i] = '\u0947'; break; case '\u094C': s[i] = '\u094B'; break; default: break; } } return len; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy