All downloads are free. The search and download functionality uses the official Maven repository.

org.carrot2.text.preprocessing.Tokenizer Maven / Gradle / Ivy


/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.text.preprocessing;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.lang.StringUtils;
import org.carrot2.core.Document;
import org.carrot2.core.ProcessingException;
import org.carrot2.core.attribute.Init;
import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.preprocessing.PreprocessingContext.AllFields;
import org.carrot2.text.preprocessing.PreprocessingContext.AllTokens;
import org.carrot2.text.util.MutableCharArray;
import org.carrot2.util.CharArrayUtils;
import org.carrot2.util.ExceptionUtils;
import org.carrot2.util.attribute.Attribute;
import org.carrot2.util.attribute.AttributeLevel;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.DefaultGroups;
import org.carrot2.util.attribute.Group;
import org.carrot2.util.attribute.Input;
import org.carrot2.util.attribute.Label;
import org.carrot2.util.attribute.Level;

import com.carrotsearch.hppc.ByteArrayList;
import com.carrotsearch.hppc.IntArrayList;
import com.carrotsearch.hppc.ShortArrayList;
import org.carrot2.shaded.guava.common.collect.Lists;

/**
 * Performs tokenization of documents.
 * 

* This class saves the following results to the {@link PreprocessingContext}: *

    *
  • {@link AllTokens#image}
  • *
  • {@link AllTokens#documentIndex}
  • *
  • {@link AllTokens#fieldIndex}
  • *
  • {@link AllTokens#type}
  • *
*/ @Bindable(prefix = "Tokenizer") public final class Tokenizer { /** * Textual fields of documents that should be tokenized and parsed for clustering. */ @Init @Input @Attribute @Label("Document fields") @Level(AttributeLevel.ADVANCED) @Group(DefaultGroups.PREPROCESSING) public Collection documentFields = Arrays.asList(new String [] { Document.TITLE, Document.SUMMARY }); /** * Token images. */ private ArrayList images; /** * An array of token types. * * @see ITokenizer */ private ShortArrayList tokenTypes; /** * An array of document indexes. */ private IntArrayList documentIndices; /** * An array of field indexes. * * @see AllFields */ private ByteArrayList fieldIndices; /** * Performs tokenization and saves the results to the context. */ public void tokenize(PreprocessingContext context) { // Documents to tokenize final List documents = context.documents; // Fields to tokenize final String [] fieldNames = documentFields.toArray(new String [documentFields.size()]); if (fieldNames.length > 8) { throw new ProcessingException("Maximum number of tokenized fields is 8."); } // Prepare arrays images = Lists.newArrayList(); tokenTypes = new ShortArrayList(); documentIndices = new IntArrayList(); fieldIndices = new ByteArrayList(); final Iterator docIterator = documents.iterator(); int documentIndex = 0; final ITokenizer ts = context.language.getTokenizer(); final MutableCharArray wrapper = new MutableCharArray(CharArrayUtils.EMPTY_ARRAY); while (docIterator.hasNext()) { final Document doc = docIterator.next(); boolean hadTokens = false; for (int i = 0; i < fieldNames.length; i++) { final byte fieldIndex = (byte) i; final String fieldName = fieldNames[i]; final String fieldValue = doc.getField(fieldName); if (!StringUtils.isEmpty(fieldValue)) { try { short tokenType; ts.reset(new StringReader(fieldValue)); if ((tokenType = ts.nextToken()) != ITokenizer.TT_EOF) { if (hadTokens) addFieldSeparator(documentIndex); do { ts.setTermBuffer(wrapper); add(documentIndex, fieldIndex, 
context.intern(wrapper), tokenType); } while ( (tokenType = ts.nextToken()) != ITokenizer.TT_EOF); hadTokens = true; } } catch (IOException e) { // Not possible (StringReader above)? throw ExceptionUtils.wrapAsRuntimeException(e); } } } if (docIterator.hasNext()) { addDocumentSeparator(); } documentIndex++; } addTerminator(); // Save results in the PreprocessingContext context.allTokens.documentIndex = documentIndices.toArray(); context.allTokens.fieldIndex = fieldIndices.toArray(); context.allTokens.image = images.toArray(new char [images.size()] []); context.allTokens.type = tokenTypes.toArray(); context.allFields.name = fieldNames; // Clean up images = null; fieldIndices = null; tokenTypes = null; documentIndices = null; } /** * Adds a special terminating token required at the very end of all documents. */ void addTerminator() { add(-1, (byte) -1, null, ITokenizer.TF_TERMINATOR); } /** * Adds a document separator to the lists. */ void addDocumentSeparator() { add(-1, (byte) -1, null, ITokenizer.TF_SEPARATOR_DOCUMENT); } /** * Adds a field separator to the lists. */ void addFieldSeparator(int documentIndex) { add(documentIndex, (byte) -1, null, ITokenizer.TF_SEPARATOR_FIELD); } /** * Adds a sentence separator to the lists. */ void addSentenceSeparator(int documentIndex, byte fieldIndex) { add(documentIndex, fieldIndex, null, ITokenizer.TF_SEPARATOR_FIELD); } /** * Adds custom token code to the sequence. May be used to add separator constants. */ void add(int documentIndex, byte fieldIndex, char [] image, short tokenTypeCode) { documentIndices.add(documentIndex); fieldIndices.add(fieldIndex); images.add(image); tokenTypes.add(tokenTypeCode); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy